knitr::opts_chunk$set(echo = TRUE)

INSTALLING LIBRARIES

# install.packages("Hmisc")
# install.packages("pastecs")
# install.packages("ggplot2")
# install.packages("Hmisc")
# install.packages("fastDummies")
# install.packages("lmtest")
# install.packages("lmtest")
# install.packages("caretEnsemble")
# install.packages("Amelia")
# install.packages("GGally")
library(ggplot2)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(corrplot)
## corrplot 0.92 loaded
library(caret)
## Loading required package: lattice
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-4
library(leaps)
library(reshape2)
library(gridExtra)
library(fastDummies)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(pastecs)
library(skimr)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.6      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.0 
## ✔ readr   2.1.2      ✔ forcats 0.5.1 
## ✔ purrr   0.3.4      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%()     masks ggplot2::%+%()
## ✖ psych::alpha()   masks ggplot2::alpha()
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ tidyr::expand()  masks Matrix::expand()
## ✖ tidyr::extract() masks pastecs::extract()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::first()   masks pastecs::first()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ dplyr::last()    masks pastecs::last()
## ✖ purrr::lift()    masks caret::lift()
## ✖ tidyr::pack()    masks Matrix::pack()
## ✖ tidyr::unpack()  masks Matrix::unpack()
library(caret)
library(caretEnsemble)
## 
## Attaching package: 'caretEnsemble'
## 
## The following object is masked from 'package:ggplot2':
## 
##     autoplot
library(psych)
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(mice)
## 
## Attaching package: 'mice'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(rpart)
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:gridExtra':
## 
##     combine
## 
## The following object is masked from 'package:psych':
## 
##     outlier
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(nnet)
library(ROCR)
library(Metrics)
## 
## Attaching package: 'Metrics'
## 
## The following objects are masked from 'package:caret':
## 
##     precision, recall
library(caret)
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## 
## Attaching package: 'forecast'
## 
## The following object is masked from 'package:Metrics':
## 
##     accuracy
## 
## The following object is masked from 'package:caretEnsemble':
## 
##     autoplot
library(rpart)
library(rattle)
## Loading required package: bitops
## 
## Attaching package: 'bitops'
## 
## The following object is masked from 'package:Matrix':
## 
##     %&%
## 
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
## 
## Attaching package: 'rattle'
## 
## The following object is masked from 'package:randomForest':
## 
##     importance
library(ggplot2)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following object is masked from 'package:purrr':
## 
##     compact
library(rlist)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following object is masked from 'package:Metrics':
## 
##     auc
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ROSE)
## Loaded ROSE 0.0-4
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(rattle)
library(rpart.plot)
library(RColorBrewer)

LOADING/READ FILE

# Load the caravan insurance data set; suppress the column-type message.
df <- readr::read_csv("caravan-insurance-challenge.csv", show_col_types = FALSE)

INITIAL EXPLORATION THROUGH VISUALIZATION

dim(df)
## [1] 9822   87

The total number of observations is 9822. The total number of variables is 86, ignoring the 1st column, which is not meaningful for our analysis.

DESCRIPTIVE ANALYSIS

skim(df)
Data summary
Name df
Number of rows 9822
Number of columns 87
_______________________
Column type frequency:
character 1
numeric 86
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ORIGIN 0 1 4 5 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
MOSTYPE 0 1 24.25 12.92 1 10 30 35 41 ▆▃▃▇▇
MAANTHUI 0 1 1.11 0.41 1 1 1 1 10 ▇▁▁▁▁
MGEMOMV 0 1 2.68 0.78 1 2 3 3 6 ▇▇▂▁▁
MGEMLEEF 0 1 3.00 0.80 1 2 3 3 6 ▃▇▃▁▁
MOSHOOFD 0 1 5.78 2.87 1 3 7 8 10 ▃▃▃▇▃
MGODRK 0 1 0.70 1.02 0 0 0 1 9 ▇▂▁▁▁
MGODPR 0 1 4.64 1.72 0 4 5 6 9 ▁▂▇▃▁
MGODOV 0 1 1.05 1.01 0 0 1 2 5 ▇▃▁▁▁
MGODGE 0 1 3.26 1.61 0 2 3 4 9 ▂▇▇▁▁
MRELGE 0 1 6.19 1.90 0 5 6 7 9 ▁▁▃▇▃
MRELSA 0 1 0.87 0.96 0 0 1 1 7 ▇▂▁▁▁
MRELOV 0 1 2.29 1.71 0 1 2 3 9 ▅▇▂▁▁
MFALLEEN 0 1 1.89 1.78 0 0 2 3 9 ▇▆▂▁▁
MFGEKIND 0 1 3.24 1.61 0 2 3 4 9 ▂▇▆▁▁
MFWEKIND 0 1 4.30 1.98 0 3 4 6 9 ▂▆▇▃▁
MOPLHOOG 0 1 1.48 1.65 0 0 1 2 9 ▇▃▁▁▁
MOPLMIDD 0 1 3.31 1.72 0 2 3 4 9 ▃▇▇▂▁
MOPLLAAG 0 1 4.59 2.28 0 3 5 6 9 ▂▆▇▆▂
MBERHOOG 0 1 1.90 1.81 0 0 2 3 9 ▇▆▂▁▁
MBERZELF 0 1 0.40 0.79 0 0 0 1 5 ▇▁▁▁▁
MBERBOER 0 1 0.55 1.11 0 0 0 1 9 ▇▁▁▁▁
MBERMIDD 0 1 2.88 1.85 0 2 3 4 9 ▃▇▃▁▁
MBERARBG 0 1 2.23 1.75 0 1 2 3 9 ▆▇▃▁▁
MBERARBO 0 1 2.29 1.68 0 1 2 3 9 ▆▇▃▁▁
MSKA 0 1 1.65 1.74 0 0 1 2 9 ▇▅▁▁▁
MSKB1 0 1 1.60 1.32 0 1 2 2 9 ▇▇▁▁▁
MSKB2 0 1 2.20 1.53 0 1 2 3 9 ▅▇▃▁▁
MSKC 0 1 3.74 1.94 0 2 4 5 9 ▂▇▇▂▁
MSKD 0 1 1.07 1.30 0 0 1 2 9 ▇▂▁▁▁
MHHUUR 0 1 4.19 3.09 0 2 4 7 9 ▇▇▆▅▇
MHKOOP 0 1 4.82 3.09 0 2 5 7 9 ▇▅▆▇▇
MAUT1 0 1 6.02 1.54 0 5 6 7 9 ▁▁▅▇▂
MAUT2 0 1 1.34 1.21 0 0 1 2 9 ▇▅▁▁▁
MAUT0 0 1 1.96 1.60 0 0 2 3 9 ▇▇▂▁▁
MZFONDS 0 1 6.25 2.00 0 5 7 8 9 ▁▂▅▇▅
MZPART 0 1 2.75 2.00 0 1 2 4 9 ▅▇▅▂▁
MINKM30 0 1 2.58 2.07 0 1 2 4 9 ▇▇▅▂▁
MINK3045 0 1 3.51 1.87 0 2 4 5 9 ▃▇▇▂▁
MINK4575 0 1 2.74 1.95 0 1 3 4 9 ▅▇▅▁▁
MINK7512 0 1 0.81 1.17 0 0 0 1 9 ▇▂▁▁▁
MINK123M 0 1 0.21 0.56 0 0 0 0 9 ▇▁▁▁▁
MINKGEM 0 1 3.80 1.33 0 3 4 4 9 ▁▇▇▂▁
MKOOPKLA 0 1 4.26 2.00 1 3 4 6 8 ▅▇▇▅▅
PWAPART 0 1 0.76 0.96 0 0 0 2 3 ▇▁▁▅▁
PWABEDR 0 1 0.04 0.36 0 0 0 0 6 ▇▁▁▁▁
PWALAND 0 1 0.07 0.51 0 0 0 0 4 ▇▁▁▁▁
PPERSAUT 0 1 2.96 2.92 0 0 5 6 9 ▇▁▂▆▁
PBESAUT 0 1 0.05 0.57 0 0 0 0 7 ▇▁▁▁▁
PMOTSCO 0 1 0.17 0.89 0 0 0 0 7 ▇▁▁▁▁
PVRAAUT 0 1 0.01 0.24 0 0 0 0 9 ▇▁▁▁▁
PAANHANG 0 1 0.02 0.20 0 0 0 0 5 ▇▁▁▁▁
PTRACTOR 0 1 0.09 0.60 0 0 0 0 7 ▇▁▁▁▁
PWERKT 0 1 0.01 0.22 0 0 0 0 6 ▇▁▁▁▁
PBROM 0 1 0.22 0.81 0 0 0 0 6 ▇▁▁▁▁
PLEVEN 0 1 0.20 0.91 0 0 0 0 9 ▇▁▁▁▁
PPERSONG 0 1 0.01 0.19 0 0 0 0 6 ▇▁▁▁▁
PGEZONG 0 1 0.02 0.21 0 0 0 0 3 ▇▁▁▁▁
PWAOREG 0 1 0.02 0.38 0 0 0 0 7 ▇▁▁▁▁
PBRAND 0 1 1.85 1.88 0 0 2 4 8 ▇▅▃▁▁
PZEILPL 0 1 0.00 0.06 0 0 0 0 3 ▇▁▁▁▁
PPLEZIER 0 1 0.02 0.24 0 0 0 0 6 ▇▁▁▁▁
PFIETS 0 1 0.03 0.16 0 0 0 0 1 ▇▁▁▁▁
PINBOED 0 1 0.02 0.21 0 0 0 0 6 ▇▁▁▁▁
PBYSTAND 0 1 0.05 0.40 0 0 0 0 5 ▇▁▁▁▁
AWAPART 0 1 0.40 0.49 0 0 0 1 2 ▇▁▅▁▁
AWABEDR 0 1 0.01 0.13 0 0 0 0 5 ▇▁▁▁▁
AWALAND 0 1 0.02 0.14 0 0 0 0 1 ▇▁▁▁▁
APERSAUT 0 1 0.56 0.61 0 0 1 1 12 ▇▁▁▁▁
ABESAUT 0 1 0.01 0.13 0 0 0 0 5 ▇▁▁▁▁
AMOTSCO 0 1 0.04 0.22 0 0 0 0 8 ▇▁▁▁▁
AVRAAUT 0 1 0.00 0.07 0 0 0 0 4 ▇▁▁▁▁
AAANHANG 0 1 0.01 0.12 0 0 0 0 3 ▇▁▁▁▁
ATRACTOR 0 1 0.03 0.25 0 0 0 0 6 ▇▁▁▁▁
AWERKT 0 1 0.01 0.11 0 0 0 0 6 ▇▁▁▁▁
ABROM 0 1 0.07 0.27 0 0 0 0 3 ▇▁▁▁▁
ALEVEN 0 1 0.08 0.38 0 0 0 0 8 ▇▁▁▁▁
APERSONG 0 1 0.00 0.07 0 0 0 0 1 ▇▁▁▁▁
AGEZONG 0 1 0.01 0.09 0 0 0 0 1 ▇▁▁▁▁
AWAOREG 0 1 0.00 0.07 0 0 0 0 2 ▇▁▁▁▁
ABRAND 0 1 0.57 0.56 0 0 1 1 7 ▇▁▁▁▁
AZEILPL 0 1 0.00 0.03 0 0 0 0 1 ▇▁▁▁▁
APLEZIER 0 1 0.01 0.08 0 0 0 0 2 ▇▁▁▁▁
AFIETS 0 1 0.03 0.21 0 0 0 0 4 ▇▁▁▁▁
AINBOED 0 1 0.01 0.09 0 0 0 0 2 ▇▁▁▁▁
ABYSTAND 0 1 0.01 0.12 0 0 0 0 2 ▇▁▁▁▁
CARAVAN 0 1 0.06 0.24 0 0 0 0 1 ▇▁▁▁▁

There are no missing values (NA) or NaN [Not a Number] entries in our data.

PLOTTING GRAPHS

# Count customers without / with a caravan policy (CARAVAN is coded 0/1).
Uninsured <- sum(df$CARAVAN == 0)
Insured <- sum(df$CARAVAN == 1)

We use this function to find out the total count of rows where customers were either insured = 1 or uninsured = 0 for Caravan Insurance Only

# Build a two-row summary frame and plot insured vs. uninsured counts.
frame <- data.frame(
  Policy_Status = factor(c("Uninsured", "Insured"), levels = c("Uninsured", "Insured")),
  Count = c(Uninsured, Insured)
)
plot_ly(frame, x = ~Policy_Status, y = ~Count, type = "bar",
        color = frame$Policy_Status, colors = c("Purple", "Gold")) %>%
  # Fixed the title typo ("Insuraned") and the unclosed <b> tag.
  layout(title = "<b>Insured vs Uninsured</b>",
         legend = list(title = list(text = '<b> Policy Status </b>')))

From the graph below we mapped those who were insured for caravan insurance against those who were not. We see that the count of Uninsured = 9236 vs Insured = 586. This led us to think about the factors behind why such a large number of customers are not caravan insured compared with those who are. Hence our research question: to figure out the characteristics of those who have caravan insurance.

CUSTOMER MAIN TYPE

# Bar chart of customer main type, filled by CARAVAN status.
# Columns are referenced by bare name inside aes(): using df$... inside aes()
# bypasses ggplot2's data masking and is a well-known source of subtle bugs.
ggplot(df, aes(factor(MOSHOOFD))) +
    geom_bar(aes(fill = factor(CARAVAN))) +
    labs(x="Customer Main type") +
    scale_fill_discrete(name = "CARAVAN") +
    ggtitle("Caravan Policy based on Customer Main Type") +
    theme(plot.title = element_text(hjust = 0.5))

# Readable alias of the customer-main-type code (MOSHOOFD) for later analysis.
df$maintype = df$MOSHOOFD

# Count insured customers (CARAVAN == 1) within each main-type category 1-10.
nrow(df[df$maintype == 1 & df$CARAVAN == 1,])
## [1] 75
nrow(df[df$maintype == 2 & df$CARAVAN == 1,])
## [1] 103
nrow(df[df$maintype == 3 & df$CARAVAN == 1,])
## [1] 109
nrow(df[df$maintype == 4 & df$CARAVAN == 1,])
## [1] 0
nrow(df[df$maintype == 5 & df$CARAVAN == 1,])
## [1] 18
nrow(df[df$maintype == 6 & df$CARAVAN == 1,])
## [1] 9
nrow(df[df$maintype == 7 & df$CARAVAN == 1,])
## [1] 35
nrow(df[df$maintype == 8 & df$CARAVAN == 1,])
## [1] 151
nrow(df[df$maintype == 9 & df$CARAVAN == 1,])
## [1] 75
nrow(df[df$maintype == 10 & df$CARAVAN == 1,])
## [1] 11

Here we wanted to see which customer main type has the highest frequency/count of buying the insurance. Based on the results, we see that there are at least 4 main customer categories that buy insurance. However, for ease and understanding we will only consider the top 2. This brings us to select category numbers 8 and 3, where 8 = Family with grown-ups and 3 = Driven Growers.

CUSTOMER SUB TYPE

# Bar chart of customer sub-type, filled by CARAVAN status.
# Bare column names in aes() instead of df$... (data-masking idiom).
ggplot(df, aes(factor(MOSTYPE))) +
    geom_bar(aes(fill = factor(CARAVAN))) +
    labs(x="Customer Sub type") +
    scale_fill_discrete(name = "CARAVAN") +
    ggtitle("Policy Bought based on Customer sub Type") +
    theme(plot.title = element_text(hjust = 0.5))

# Readable alias of the customer-sub-type code (MOSTYPE).
df$subtype = df$MOSTYPE
# Insured counts for the two most frequent insured sub-types.
nrow(df[df$subtype == 33 & df$CARAVAN == 1,])
## [1] 80
nrow(df[df$subtype == 8 & df$CARAVAN == 1,])
## [1] 72

Categories 33 and 8 purchased more policies. Based on our main category, which comprises various sub-categories, we can see that sub-categories 33 and 8 are prone to buying insurance. These should be considered the characteristics/attributes of the types of customers that exist in the main customer category. Hence we could say that those who are middle class, and those who are lower class but have large families, have a higher chance of getting the insurance.

AGE

# Bar chart of policies by age band, with a count label above each bar.
# after_stat(count) replaces the `..count..` notation deprecated in ggplot2
# 3.4.0 (the knit output warned about exactly this). Bare column names in aes().
ggplot(df, aes(factor(MGEMLEEF))) +
    geom_bar(aes(fill = factor(CARAVAN))) +
    geom_text(stat='count', aes(label = after_stat(count)), vjust=0) +
    labs(x="Age Group") +
    scale_fill_discrete(name = "CARAVAN") +
    ggtitle("Policy bought on age group") +
    theme(plot.title = element_text(hjust = 0.5))

# Readable alias of the age-band code (MGEMLEEF, bands 1-6).
df$age = df$MGEMLEEF
# Insured counts within each age band.
nrow(df[df$age == 1 & df$CARAVAN == 1,])
## [1] 1
nrow(df[df$age == 2 & df$CARAVAN == 1,])
## [1] 156
nrow(df[df$age == 3 & df$CARAVAN == 1,])
## [1] 303
nrow(df[df$age == 4 & df$CARAVAN == 1,])
## [1] 105
nrow(df[df$age == 5 & df$CARAVAN == 1,])
## [1] 20
nrow(df[df$age == 6 & df$CARAVAN == 1,])
## [1] 1

Here we have explored to see what is the age range of the customers that buy the insurance. Based on our analysis we see that customers who are between the ages 40-50 are prone to buying insurance compared with others. Hence, we could say that it is among the many characteristics of the main customer group [3,8]

NO. OF HOUSES

# Bar chart of number of houses, with count labels.
# after_stat(count) replaces deprecated `..count..`; bare column names in aes().
ggplot(df, aes(factor(MAANTHUI))) +
    geom_bar(aes(fill = factor(CARAVAN))) +
    geom_text(stat='count', aes(label = after_stat(count)), vjust=0) +
    labs(x="Number of houses") +
    scale_fill_discrete(name = "CARAVAN") +
    ggtitle("Number of houses customer has who bought insurance") +
    theme(plot.title = element_text(hjust = 0.5))

# Readable alias of the number-of-houses code (MAANTHUI).
df$noofhouses = df$MAANTHUI
# Insured counts by number of houses owned.
nrow(df[df$noofhouses == 1 & df$CARAVAN == 1,])
## [1] 526
nrow(df[df$noofhouses == 2 & df$CARAVAN == 1,])
## [1] 59
nrow(df[df$noofhouses == 3 & df$CARAVAN == 1,])
## [1] 1

Now we wanted to see who is prone to getting an insurance with respect to number of houses and we have found that customers having at least 1 house are likely to get the insurance.

NO. OF HOUSEHOLD

# Bar chart of household size, with count labels.
# after_stat(count) replaces deprecated `..count..`; bare column names in aes().
ggplot(df, aes(factor(MGEMOMV))) +
    geom_bar(aes(fill = factor(CARAVAN))) +
    geom_text(stat='count', aes(label = after_stat(count)), vjust=0) +
    labs(x="Number of house hold") +
    scale_fill_discrete(name = "CARAVAN") +
    ggtitle("Number of house hold") +
    theme(plot.title = element_text(hjust = 0.5))

# Readable alias of the household-size code (MGEMOMV).
df$noofhousehold = df$MGEMOMV
# Insured counts by household size.
nrow(df[df$noofhousehold == 1 & df$CARAVAN == 1,])
## [1] 11
nrow(df[df$noofhousehold == 2 & df$CARAVAN == 1,])
## [1] 195
nrow(df[df$noofhousehold == 3 & df$CARAVAN == 1,])
## [1] 275
# 0/1 indicator for the most insurance-prone household size (3).
df$hasThreeHouseHold = ifelse(df$noofhousehold == 3, 1, 0)

Now we wanted to see who is prone to getting insurance with respect to household size, and we found that customers with a household size of 3 are the most likely to get the insurance.

Characteristics we found so far

1. Customer having 3 house hold

2. Customer have one house, Age of customer is between 40 to 50

3. Customer are Driven Growers

4. Customer belongs to Lower class large families

Mod 1 - Correlation Analysis

# Correlation matrix of the aliased variables of interest against CARAVAN.
corrplot(cor(df[, c("subtype","maintype", "age", "noofhouses", "noofhousehold", "CARAVAN")]), method = "number")

From the correlation matrix we have below, we have some interesting insights. The reason to run the correlation matrix was to figure out variables of interest which might cause either over-fitting or under-fitting. # Here we can see that the variables of interest are positively correlated. One exception to this is the correlation between age and number of households. We see that there is a negative correlation between it. Which actually makes sense because, the greater the age, the no of households will decrease.

Which variable should we select when they are highly positively correlated?

# Correlation of each candidate predictor with the response variable.
cor(df$subtype, df$CARAVAN)
## [1] -0.06074174
cor(df$maintype, df$CARAVAN)
## [1] -0.05930648

We will choose the one with the higher correlation with the response variable. In this case both are weakly correlated, so it does not matter much which we choose; we will choose maintype.

Categorizing Data

Number of Houses

# Frequency of number-of-houses values before recoding.
table(df$MAANTHUI)
## 
##    1    2    3    4    5    6    7    8   10 
## 8915  821   64    4    3    3    8    2    2
# BUG FIX: the >2 indicator must be computed BEFORE capping MAANTHUI at 2;
# in the original order `df$MAANTHUI > 2` was evaluated after the cap and was
# never TRUE, so moreThanTwoHouse was all zeros.
df$moreThanTwoHouse = ifelse(df$MAANTHUI > 2, 1, 0)
# Cap the house count at 2 ("2 or more"), then flag single-house customers.
df$MAANTHUI = replace(df$MAANTHUI, df$MAANTHUI > 2, 2)
df$OneHouse = ifelse(df$MAANTHUI ==1, 1, 0)

By looking at the frequencies of houses we categorized houses into dummies. As we already saw from the bar graph above, customers having at least 1 house are likely to buy insurance, so we thought it would be a good idea to create dummies in a binary way instead of having many different ranges. That is why we have done this step.

Grouping customer sub-types into meaningful customer types.

# Map each MOSTYPE sub-type code to a named customer segment and create one
# 0/1 indicator column per segment (same dummies and values as before).
segment_codes <- list(
  averageFamily        = c(12, 11, 9, 10, 13),
  loners               = c(17, 15, 18, 16, 19),
  conservativeFamilies = c(39, 38),
  crusingSeniors       = c(26, 25, 28, 27),
  drivenGrowers        = c(6, 7, 8),
  grownups             = c(33, 34, 35, 36, 37),
  framers              = c(40, 41),
  livingWell           = c(20, 21, 22, 23, 24),
  retired              = c(29, 30, 31, 32),
  successful           = c(1, 2, 3, 4, 5)
)
for (segment in names(segment_codes)) {
  df[[segment]] <- ifelse(df$MOSTYPE %in% segment_codes[[segment]], 1, 0)
}

# Summarize the size of each customer segment and plot it as a bar chart.
segment_names <- c("averageFamily", "loners", "conservativeFamilies", "crusingSeniors",
                   "drivenGrowers", "grownups", "framers", "livingWell", "retired", "successful")
dat <- data.frame(
  Categorized_Customers = factor(segment_names, levels = segment_names),
  # Each dummy column is 0/1, so its sum is the segment's customer count.
  Count = unname(vapply(segment_names, function(s) sum(df[[s]]), numeric(1)))
)

plot_ly(dat, x = ~Categorized_Customers, y = ~Count, type = 'bar',
        color = dat$Categorized_Customers) %>%
  # Fixed the unclosed <b> tag in the title.
  layout(title = "<b>Customer Types</b>", legend = list(title = list(text ='<b> Types </b>')))
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

We have 10 customer main types and 41 customer sub-types. A leveled approach was to consider merging them into one bucket based on sub-category. Since one of our goal is to find the characteristics of customers, we believe that the customer sub-type are the characteristics. However, when we look at customer main type alone, we cannot figure out the characteristics of people in this group. Hence, we merged sub-categories with main categories.

An advantage of doing this is that when we look at customer main type, we would automatically know what are the characteristics of this group - explained by the sub types within this group

After having done so, we constructed a bar chart and observed the following:

  1. Grown-ups are the most popular
  2. Followed by Average Family
  3. Followed by Conservative Families

Income Conversion

# Convert the "income < 30k" category code (0-9) to an estimated income value.
# Category k (1-9) maps to a fixed fraction of 30,000; category 0 stays 0.
minkm30_fractions <- c(0.05, 0.17, 0.3, 0.43, 0.56, 0.69, 0.82, 0.94, 1)
df$MINKM30_c <- c(0, minkm30_fractions)[df$MINKM30 + 1] * 30000


# Convert the "income 30-45k" category code (0-9) to an estimated income value,
# scaled against 45,000; category 0 stays 0.
mink3045_fractions <- c(0.05, 0.17, 0.3, 0.43, 0.56, 0.69, 0.82, 0.94, 1)
df$MINK3045_c <- c(0, mink3045_fractions)[df$MINK3045 + 1] * 45000

# Convert the "income 45-75k" category code (0-9) to an estimated income value,
# scaled against 75,000; category 0 stays 0. (Comment previously said "70k".)
mink4575_fractions <- c(0.05, 0.17, 0.3, 0.43, 0.56, 0.69, 0.82, 0.94, 1)
df$MINK4575_c <- c(0, mink4575_fractions)[df$MINK4575 + 1] * 75000

# Convert the "income 75-122k" category code (0-9) to an estimated income value,
# scaled against 122,000; category 0 stays 0.
mink7512_fractions <- c(0.05, 0.17, 0.3, 0.43, 0.56, 0.69, 0.82, 0.94, 1)
df$MINK7512_c <- c(0, mink7512_fractions)[df$MINK7512 + 1] * 122000

# Convert the "income > 123k" category code (0-9) to an estimated income value,
# scaled against 123,000; category 0 stays 0.
mink123m_fractions <- c(0.05, 0.17, 0.3, 0.43, 0.56, 0.69, 0.82, 0.94, 1)
df$MINK123M_c <- c(0, mink123m_fractions)[df$MINK123M + 1] * 123000

# Average income
# Rough estimate: unweighted mean of the five converted income-bracket values.
df$MINKGEM_c = (df$MINK123M_c + df$MINK7512_c + df$MINK4575_c + df$MINK3045_c + df$MINKM30_c)/5
# Histogram of the estimated income levels.
plot_ly(x = ~df$MINKGEM_c, type = "histogram", color = df$MINKGEM_c, colors = c("gold", "blue", "green", "pink", "brown")) %>%
  layout(title = "<b>Income Levels<b>")
## Warning: textfont.color doesn't (yet) support data arrays
## Warning in min(x, na.rm = na.rm): no non-missing arguments to min; returning Inf
## Warning in max(x, na.rm = na.rm): no non-missing arguments to max; returning
## -Inf
## Warning: textfont.color doesn't (yet) support data arrays

Converting age into numerical.

# Convert the age-band code (1-6) to a representative midpoint age:
# 1 -> 25, 2 -> 35, 3 -> 45, 4 -> 55, 5 -> 65, 6 -> 75.
# BUG FIX: the original ifelse chain re-tested `== 4` on its last line instead
# of `== 6`, so band 6 was never converted and kept its raw code value of 6.
df$MGEMLEEF_c <- c(25, 35, 45, 55, 65, 75)[df$MGEMLEEF]
# MOSTYPE : customer subtype
# MFWEKIND: Household with children
# MOPLLAAG: Lower level education
# MHHUUR :  Rented house
# MHKOOP :  Home owners
# MINKM30:  Income < 30.000 low income
# MINK7512: Income 75-122.000 high income
# MKOOPKLA: Purchasing power class
# PPERSAUT: Contribution car policies
# CARAVAN:  Number of mobile home policies 0 - 1

Outliers

plot_ly(x = ~df$MINKM30_c, y = ~df$MOSTYPE, type = "box", color = df$MOSTYPE)
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed

After having converted the categorical income variable to numerical, here we take income less than 30K against customer sub-categories and draw a box plot. From the box plot below, we can see that as soon as income crosses 10K, we begin to see a few outliers for certain sub-categories. Another important thing to note is that when income jumps above 20K, the distance of the outliers starts to increase. We could say that we are seeing extreme outliers at income levels ranging from 25K to 30K.

Now the question is if we should keep outliers in our analysis, whether mild or extreme or delete both of them and then proceed?

plot_ly(y = ~df$MOSTYPE, x = ~df$MINK3045_c, type = "box", color = df$MOSTYPE)
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
plot_ly(x = df$MINK4575_c, y = df$MOSTYPE, type = "box", color = df$MOSTYPE)
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed

Class imbalance problem

prop.table(table(df$CARAVAN))
## 
##          0          1 
## 0.94033802 0.05966198

When we drew a frequency table, we saw that we have a class imbalance problem: about 94% of the records in the CARAVAN column are 0s while only about 6% are 1s. This is a problem which needs to be solved before we go further.

barplot(prop.table(table(df$CARAVAN)), col = rainbow(2), ylim = c(0,1), main = "Class Distribution")

Here we have shown how the distribution is happening through a bar chart

DATA PARTITION

# training data
# Keep rows flagged "train" in ORIGIN, then drop the ORIGIN column itself.
df_train = (df[df$ORIGIN == "train",])
df_train = (df_train[,-1])
table(df_train$CARAVAN)
## 
##    0    1 
## 5474  348
#testing data
# Same split for the held-out "test" rows.
df_test = (df[df$ORIGIN == "test",])
df_test = (df_test[,-1])
nrow(df_test)
## [1] 4000
table(df_test$CARAVAN)
## 
##    0    1 
## 3762  238

Solving class imbalance problem

# Oversample the minority class (CARAVAN == 1) in the training data so both
# classes end up with 5474 rows each (N = 10948 = 2 * 5474).
over_train = ovun.sample(CARAVAN ~ ., data =df_train, method = "over", N =10948)$data
table(over_train$CARAVAN)
## 
##    0    1 
## 5474 5474
# N = nrow(df_test) keeps the test set at its original size, so its class
# balance is effectively unchanged (see the note below).
over_test = ovun.sample(CARAVAN ~ ., data =df_test, method = "over", N =nrow(df_test))$data
table(over_test$CARAVAN)
## 
##    0    1 
## 3762  238

we are not fixing sampling problem in test data, we did this because we were having error while doing prediction

Convert train data set into factor

# Convert every training column to a factor: the predictors are categorical
# codes. lapply over the data.frame replaces the `1:ncol()` index loop (safe
# for zero columns and avoids repeated column-by-column subsetting).
over_train[] <- lapply(over_train, as.factor)

In the followng code, since our entire data is categorical we have converted all of it into factors, for training data

Convert test data set into factor

# Convert every test column to a factor, mirroring the training-data step.
# lapply replaces the `1:ncol()` index loop.
over_test[] <- lapply(over_test, as.factor)

In the followng code, since our entire data is categorical we have converted all of it into factors, for test data as well

# BUG FIX: as.numeric() applied directly to a factor returns the internal
# level codes (1, 2, 3, ...), not the original values. Converting through
# as.character() first recovers the true income and age numbers.
over_test$MINKGEM_c = as.numeric(as.character(over_test$MINKGEM_c))
over_train$MINKGEM_c = as.numeric(as.character(over_train$MINKGEM_c))

over_test$MGEMLEEF_c = as.numeric(as.character(over_test$MGEMLEEF_c))
over_train$MGEMLEEF_c = as.numeric(as.character(over_train$MGEMLEEF_c))

We realized that income (and the converted age) should not be factors, since we converted them above, so we converted them back to numeric.

Draw Confusion matrix function

# Helper wrappers used to evaluate each fitted model.

# Print the standard model summary.
drewSummary <- function(model) {
  summary(model)
}

# Predict on `test_data`, threshold probabilities at 0.5, and show the
# confusion matrix with class "1" (bought caravan insurance) as positive.
drewMatrix <- function(model, test_data) {
  probs <- predict(model, test_data, type = "response")
  labels <- ifelse(probs >= 0.5, 1, 0)
  confusionMatrix(as.factor(labels), as.factor(test_data$CARAVAN), positive = "1")
}

# Likelihood-ratio (chi-squared) comparison of two nested models.
drewAnova <- function(model1, model2) {
  anova(model1, model2, test = 'Chisq')
}

# Plot the ROC curve of a model's thresholded predictions on the global
# over_test set.
drewROC <- function(model) {
  probs <- predict(model, over_test, type = "response")
  labels <- ifelse(probs >= 0.5, 1, 0)
  curve <- roc(over_test$CARAVAN, labels)
  plot.roc(curve)
}

# Second element of forecast::accuracy() is the RMSE vs. over_test$CARAVAN.
getRMSE <- function(predictedClass) {
  accuracy(predictedClass, as.numeric(over_test$CARAVAN))[2]
}

Correlation of Model 1 predictors

# Re-encode the candidate model-1 predictors as numeric (factor level codes)
# so cor() can be applied, then display the pairwise correlations.
new_data <- over_train
for (col in c("MOSHOOFD", "MGEMOMV", "MINKGEM", "MGEMLEEF", "CARAVAN", "OneHouse")) {
  new_data[[col]] <- as.numeric(new_data[[col]])
}

corrplot(cor(subset(new_data , select = c("MOSHOOFD", "MGEMOMV", "OneHouse", "MINKGEM", "MGEMLEEF", "CARAVAN"))), method = "number", type = "upper")

We have performed a correlation matrix to determine the factors for model one. The variables are those which we have already used above. The correlation matrix tells us which variables are important to least important

Model 1

set.seed(123)

# Model 1: logistic regression of CARAVAN on customer main type, household
# size, the one-house dummy, and the converted income and age variables.
logit.reg = glm(CARAVAN ~ MOSHOOFD + MGEMOMV + OneHouse + MINKGEM_c+MGEMLEEF_c, data = over_train, family = binomial (link = "logit"))

# Align MGEMOMV factor levels between train and test so predict() does not
# fail if the test set contains a level unseen during training.
logit.reg$xlevels[["MGEMOMV"]] <- union(logit.reg$xlevels[["MGEMOMV"]], levels(over_test$MGEMOMV))

drewSummary(logit.reg)
## 
## Call:
## glm(formula = CARAVAN ~ MOSHOOFD + MGEMOMV + OneHouse + MINKGEM_c + 
##     MGEMLEEF_c, family = binomial(link = "logit"), data = over_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7524  -1.1471   0.3593   1.1508   1.8673  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.623655   0.204826  -3.045  0.00233 ** 
## MOSHOOFD2     0.390835   0.085280   4.583 4.58e-06 ***
## MOSHOOFD3    -0.310404   0.081514  -3.808  0.00014 ***
## MOSHOOFD4   -14.597185 120.532051  -0.121  0.90361    
## MOSHOOFD5    -0.941544   0.104585  -9.003  < 2e-16 ***
## MOSHOOFD6    -1.246472   0.157307  -7.924 2.30e-15 ***
## MOSHOOFD7    -0.729989   0.095261  -7.663 1.82e-14 ***
## MOSHOOFD8    -0.340075   0.071590  -4.750 2.03e-06 ***
## MOSHOOFD9    -0.142648   0.085464  -1.669  0.09510 .  
## MOSHOOFD10   -1.619452   0.144267 -11.225  < 2e-16 ***
## MGEMOMV2      0.353052   0.129042   2.736  0.00622 ** 
## MGEMOMV3      0.303507   0.131857   2.302  0.02135 *  
## MGEMOMV4      0.346560   0.142779   2.427  0.01521 *  
## MGEMOMV5      0.183444   0.225311   0.814  0.41554    
## OneHouse1     0.123074   0.070856   1.737  0.08239 .  
## MINKGEM_c     0.002208   0.000263   8.394  < 2e-16 ***
## MGEMLEEF_c    0.057575   0.030824   1.868  0.06178 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15177  on 10947  degrees of freedom
## Residual deviance: 14363  on 10931  degrees of freedom
## AIC: 14397
## 
## Number of Fisher Scoring iterations: 13
drewMatrix(logit.reg, over_test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  315   10
##          1 3447  228
##                                           
##                Accuracy : 0.1358          
##                  95% CI : (0.1253, 0.1468)
##     No Information Rate : 0.9405          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0054          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.95798         
##             Specificity : 0.08373         
##          Pos Pred Value : 0.06204         
##          Neg Pred Value : 0.96923         
##              Prevalence : 0.05950         
##          Detection Rate : 0.05700         
##    Detection Prevalence : 0.91875         
##       Balanced Accuracy : 0.52086         
##                                           
##        'Positive' Class : 1               
## 
# Custom helper: ROC curve for logit.reg (pROC-style, per the messages below).
drewROC(logit.reg)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# getRMSE(logit.reg)
# drewAnova(logit.reg)

-We ran the regression with and without the house variable and see a minimal effect: with the house variable the model predicts 60 true positives, without it 65, and the difference is not significant, so we decided not to include this variable.

-Difference in deviance = Null deviance (15177) − Residual deviance (14363) = 814

-From the above confusion matrix for model 1 we see that the accuracy of the model is about 14%, the sensitivity is 96%, and the specificity is 8%. There are a couple of ways to either accept or discard the model: if sensitivity is the goal then this model is a good fit; otherwise, it is not.

# Drop the dummies built earlier plus the predictors (and their recoded
# versions) already used in the first regression.
# NOTE(review): the original first copied over_train into train_2 and then
# immediately overwrote that copy — the dead assignment has been removed.
train_2 <- subset(over_train, select = -c(maintype,subtype,age,noofhouses,noofhousehold,hasThreeHouseHold,OneHouse,moreThanTwoHouse,averageFamily,loners,conservativeFamilies,crusingSeniors,drivenGrowers,grownups,framers,livingWell,retired,successful,MINKM30_c,MINK3045_c,MINK4575_c,MINK7512_c,MINK123M_c,MINKGEM_c,MGEMLEEF_c,MOSHOOFD,MGEMOMV,MAANTHUI,MINKGEM,MGEMLEEF))
length(train_2)  # a data.frame's length is its column count
## [1] 81

Dropping the 5 variables that we used in the first regression, as well as the dummies we created earlier.

# Coerce every column to integer so the correlation computations below
# operate on purely numeric data.
new_df <- train_2
for (i in seq_len(ncol(new_df))) {  # seq_len() avoids the 1:0 trap of 1:ncol
  new_df[, i] <- as.integer(new_df[, i])
}

Model 2 | Building a model using correlation analysis

Now we build model number 2, which will also use the correlation matrix, but this time we want to remove variables that are highly correlated with each other and keep those that are less correlated, and hence have a clearer individual impact on the response variable. We remove such variables step by step, explaining why at each stage.

Pre processing

# Drop zero-variance (constant) columns — cor() would yield NA/NaN for them —
# then compute the full pairwise correlation matrix.
# vapply over columns replaces apply(), which coerces the data.frame to a
# matrix and can silently change column types.
zv <- vapply(new_df, function(x) length(unique(x)) == 1L, logical(1))
dfr <- new_df[, !zv]
n <- length(colnames(dfr))
correlationMatrix <- cor(dfr[, seq_len(n)], use = "complete.obs")
summary(correlationMatrix[upper.tri(correlationMatrix)])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.999760 -0.023458 -0.001316  0.010568  0.036060  0.984158

After removing our suspected predictors we still see strong positive correlation (up to 0.98) and strong negative correlation (down to −0.999), so we need to find which variables are highly correlated with each other.

# caret::findCorrelation flags, within each group of predictors with pairwise
# |r| above the cutoff, the variables recommended for removal.
high = findCorrelation(correlationMatrix, cutoff = 0.75, names = TRUE)
high
##  [1] "MZPART"   "MHHUUR"   "MSKA"     "MRELGE"   "PBRAND"   "PWAPART" 
##  [7] "APERSAUT" "PTRACTOR" "MGODGE"   "AWABEDR"  "AWALAND"  "AAANHANG"
## [13] "PBESAUT"  "PZEILPL"  "APLEZIER" "PBYSTAND" "AWERKT"   "PLEVEN"  
## [19] "PGEZONG"  "AWAOREG"  "PBROM"    "PFIETS"   "PINBOED"  "PMOTSCO" 
## [25] "APERSONG" "AVRAAUT"
# Number of predictors flagged as highly correlated.
length(high)
## [1] 26

There are 26 variables which are correlated with each other; before dropping them we need to see how they are correlated with the response variable. The reason why we chose a cut-off value of 0.75 is that it gives us the variables that are strongly correlated with each other.

# Correlation of each flagged predictor with the response CARAVAN.
target_cor_df = data.frame(CARAVAN = cor(df_train[,sort(high)], df_train[, "CARAVAN"]))



# Order by correlation with the response, strongest first.
cor_df = target_cor_df[order(target_cor_df$CARAVAN,decreasing = T),,drop=F]

# Candidates to drop: flagged predictors whose correlation with CARAVAN < 0.1.
excludedVariables = row.names(cor_df[cor_df$CARAVAN < 0.1, ,drop=F])
excludedVariables
##  [1] "PWAPART"  "PBRAND"   "MRELGE"   "MSKA"     "PBYSTAND" "MZPART"  
##  [7] "PGEZONG"  "PFIETS"   "AWAOREG"  "PLEVEN"   "PZEILPL"  "AAANHANG"
## [13] "PMOTSCO"  "PINBOED"  "AWABEDR"  "PBESAUT"  "APERSONG" "AVRAAUT" 
## [19] "PTRACTOR" "AWERKT"   "AWALAND"  "MGODGE"   "PBROM"    "MHHUUR"
# Report how many variables will be excluded.
paste0("excluding total variables from main data set ", length(excludedVariables))
## [1] "excluding total variables from main data set 24"

There are 24 variables whose correlation coefficient with the response variable is less than 0.1, so we will exclude them. These 24 variables come from the 26 variables identified above.

# Drop the 24 weakly related, highly collinear variables from train_2.
train_2 = data.frame(train_2[, !colnames(train_2) %in% excludedVariables])
names(train_2)
##  [1] "MOSTYPE"  "MGODRK"   "MGODPR"   "MGODOV"   "MRELSA"   "MRELOV"  
##  [7] "MFALLEEN" "MFGEKIND" "MFWEKIND" "MOPLHOOG" "MOPLMIDD" "MOPLLAAG"
## [13] "MBERHOOG" "MBERZELF" "MBERBOER" "MBERMIDD" "MBERARBG" "MBERARBO"
## [19] "MSKB1"    "MSKB2"    "MSKC"     "MSKD"     "MHKOOP"   "MAUT1"   
## [25] "MAUT2"    "MAUT0"    "MZFONDS"  "MINKM30"  "MINK3045" "MINK4575"
## [31] "MINK7512" "MINK123M" "MKOOPKLA" "PWABEDR"  "PWALAND"  "PPERSAUT"
## [37] "PVRAAUT"  "PAANHANG" "PWERKT"   "PPERSONG" "PWAOREG"  "PPLEZIER"
## [43] "AWAPART"  "APERSAUT" "ABESAUT"  "AMOTSCO"  "ATRACTOR" "ABROM"   
## [49] "ALEVEN"   "AGEZONG"  "ABRAND"   "AZEILPL"  "APLEZIER" "AFIETS"  
## [55] "AINBOED"  "ABYSTAND" "CARAVAN"
# Dimension after exclusion: 57 columns remain.
length(train_2)
## [1] 57
# corrplot(cor(train_3), method = "number")

# 24 + 5 (predictors in mode l) = 29

# around 29 variables have been excluded from set so far next step would be to find good predictors which are not highly correlated each other and are significant.

# we have reduce dimension from 86 to 57

24 + 5 (predictors in model 1) = 29 variables removed

Around 29 variables have been excluded from the set so far; the next step is to find good predictors which are not highly correlated with each other and are significant.

We have reduced the dimension from 86 to 57.

# correlation between number of car policies and CARAVAN
cor(df_train$APERSAUT, df_train$CARAVAN)
## [1] 0.1442105
# correlation between contribution to car policies and CARAVAN
cor(df_train$PPERSAUT, df_train$CARAVAN)
## [1] 0.1509097
# correlation between purchasing power class and CARAVAN
cor(df_train$MKOOPKLA, df_train$CARAVAN)
## [1] 0.09593826
# Convert the surviving columns to numeric so cor.test() below can run.
new_train_2 <- train_2
for (i in seq_len(ncol(new_train_2))) {  # seq_len() avoids the 1:0 trap
  new_train_2[, i] <- as.numeric(new_train_2[, i])
}
# Pairwise Pearson correlation test of every remaining predictor against the
# response CARAVAN, collected into one table.
cor_response <- data.frame(
  "ind_var" = colnames(new_train_2),
  "dep_var" = "CARAVAN",
  "cor_coeff" = 0,
  "p_values" = 0
)

for (i in colnames(new_train_2)) {
    cor_test <- cor.test(new_train_2[, i], new_train_2[, "CARAVAN"])
    # BUG FIX: estimates were previously stored in a new
    # "correlation_coefficient" column while the sort below used the
    # untouched all-zero "cor_coeff" column, so the table was never actually
    # ordered (the printed output was in the original column order).
    cor_response[cor_response$ind_var == i, "cor_coeff"] <- cor_test$estimate
    cor_response[cor_response$ind_var == i, "p_values"] <- cor_test$p.value
}
cor_response[order(cor_response$cor_coeff, decreasing = TRUE), ]
##     ind_var dep_var cor_coeff      p_values correlation_coefficient
## 1   MOSTYPE CARAVAN         0  2.265491e-40            -0.126629109
## 2    MGODRK CARAVAN         0  1.415021e-01             0.014052163
## 3    MGODPR CARAVAN         0  1.257828e-17             0.081563718
## 4    MGODOV CARAVAN         0  2.772500e-01             0.010385020
## 5    MRELSA CARAVAN         0  2.060545e-15            -0.075761616
## 6    MRELOV CARAVAN         0  1.180399e-50            -0.142344374
## 7  MFALLEEN CARAVAN         0  5.309752e-40            -0.126026326
## 8  MFGEKIND CARAVAN         0  2.514008e-02             0.021400986
## 9  MFWEKIND CARAVAN         0  4.201894e-13             0.069204271
## 10 MOPLHOOG CARAVAN         0  5.453918e-69             0.166621631
## 11 MOPLMIDD CARAVAN         0  2.208517e-22             0.092818069
## 12 MOPLLAAG CARAVAN         0  6.059450e-91            -0.191512831
## 13 MBERHOOG CARAVAN         0  7.664487e-45             0.133694736
## 14 MBERZELF CARAVAN         0  3.100937e-12             0.066576997
## 15 MBERBOER CARAVAN         0  1.296953e-39            -0.125391135
## 16 MBERMIDD CARAVAN         0  3.868878e-17             0.080320641
## 17 MBERARBG CARAVAN         0  8.321327e-19            -0.084493667
## 18 MBERARBO CARAVAN         0  6.533712e-35            -0.117411705
## 19    MSKB1 CARAVAN         0  8.844832e-09             0.054933370
## 20    MSKB2 CARAVAN         0  1.736747e-01             0.013003473
## 21     MSKC CARAVAN         0  2.606574e-20            -0.088090245
## 22     MSKD CARAVAN         0  3.109772e-58            -0.152867441
## 23   MHKOOP CARAVAN         0  3.921174e-80             0.179713174
## 24    MAUT1 CARAVAN         0  8.114368e-65             0.161431525
## 25    MAUT2 CARAVAN         0  3.139517e-02             0.020567625
## 26    MAUT0 CARAVAN         0  3.099707e-81            -0.180953606
## 27  MZFONDS CARAVAN         0  4.201169e-38            -0.122885514
## 28  MINKM30 CARAVAN         0  2.684951e-95            -0.196050678
## 29 MINK3045 CARAVAN         0  6.678538e-01            -0.004101420
## 30 MINK4575 CARAVAN         0  1.444118e-43             0.131719061
## 31 MINK7512 CARAVAN         0  5.855925e-42             0.129183471
## 32 MINK123M CARAVAN         0  5.744262e-01            -0.005367394
## 33 MKOOPKLA CARAVAN         0  2.288708e-91             0.191958627
## 34  PWABEDR CARAVAN         0  1.000000e+00             0.000000000
## 35  PWALAND CARAVAN         0  2.092667e-07            -0.049585947
## 36 PPERSAUT CARAVAN         0 6.282637e-306             0.346248808
## 37  PVRAAUT CARAVAN         0  3.485079e-03            -0.027917884
## 38 PAANHANG CARAVAN         0  1.312035e-02             0.023705979
## 39   PWERKT CARAVAN         0  1.875280e-05            -0.040885355
## 40 PPERSONG CARAVAN         0  6.483585e-03            -0.026015928
## 41  PWAOREG CARAVAN         0  4.617519e-06             0.043770091
## 42 PPLEZIER CARAVAN         0  2.923919e-33             0.114476321
## 43  AWAPART CARAVAN         0  4.983805e-82             0.181841434
## 44 APERSAUT CARAVAN         0 5.451683e-238             0.307227802
## 45  ABESAUT CARAVAN         0  6.981609e-02            -0.017328933
## 46  AMOTSCO CARAVAN         0  3.649931e-02             0.019987546
## 47 ATRACTOR CARAVAN         0  8.764842e-06            -0.042473319
## 48    ABROM CARAVAN         0  1.431323e-33            -0.115033748
## 49   ALEVEN CARAVAN         0  1.359462e-12             0.067672943
## 50  AGEZONG CARAVAN         0  5.895091e-08             0.051790072
## 51   ABRAND CARAVAN         0  7.264731e-45             0.133730502
## 52  AZEILPL CARAVAN         0  1.600743e-03             0.030157056
## 53 APLEZIER CARAVAN         0  1.617283e-40             0.126866825
## 54   AFIETS CARAVAN         0  1.934488e-09             0.057333528
## 55  AINBOED CARAVAN         0  7.836628e-03             0.025411780
## 56 ABYSTAND CARAVAN         0  8.302969e-26             0.100145723
## 57  CARAVAN CARAVAN         0  0.000000e+00             1.000000000

Here we examine the remaining variables and their significance level with respect to our response variable; most (though not all) of the variables are significant at the 0.05 level.

Now we will further plot a correlation matrix to make sure there is no collinearity among the variables with respect to our response variable.

# Visual check for remaining collinearity among the predictors.
corrplot(cor(subset(new_train_2 , select = c(-CARAVAN))), method = "square", type = "upper")

# PPERSAUT and APERSAUT are almost perfectly collinear (r ~ 0.92 below).
cor(df_train$PPERSAUT, df_train$APERSAUT)
## [1] 0.9161545
# Compare each of the two with the response to decide which one to keep.
cor(df_train[ , c("PPERSAUT", "APERSAUT")], df_train[ , "CARAVAN"])
##            CARAVAN
## PPERSAUT 0.1509097
## APERSAUT 0.1442105
# APERSAUT (number of car policies) is nearly collinear with PPERSAUT and
# slightly less correlated with CARAVAN, so drop it.
train_2 <- data.frame(train_2[ , !colnames(train_2) %in% c("APERSAUT")])
# BUG FIX: the original message lacked a space before the number
# ("...dimension is56" in the printed output).
paste0("after removing APERSAUT dimension is ", length(train_2))
## [1] "after removing APERSAUT dimension is56"

There is a high correlation between contribution to car policies (PPERSAUT) and number of car policies (APERSAUT), so we exclude the one that is less correlated with the response variable. It is not strictly necessary to look at correlation with the response, but it is useful because it indicates how much the response changes for a given predictor. Contribution to car policies is more correlated with the response variable, so we exclude number of car policies.

# final variables selected for model 2
names(train_2)
##  [1] "MOSTYPE"  "MGODRK"   "MGODPR"   "MGODOV"   "MRELSA"   "MRELOV"  
##  [7] "MFALLEEN" "MFGEKIND" "MFWEKIND" "MOPLHOOG" "MOPLMIDD" "MOPLLAAG"
## [13] "MBERHOOG" "MBERZELF" "MBERBOER" "MBERMIDD" "MBERARBG" "MBERARBO"
## [19] "MSKB1"    "MSKB2"    "MSKC"     "MSKD"     "MHKOOP"   "MAUT1"   
## [25] "MAUT2"    "MAUT0"    "MZFONDS"  "MINKM30"  "MINK3045" "MINK4575"
## [31] "MINK7512" "MINK123M" "MKOOPKLA" "PWABEDR"  "PWALAND"  "PPERSAUT"
## [37] "PVRAAUT"  "PAANHANG" "PWERKT"   "PPERSONG" "PWAOREG"  "PPLEZIER"
## [43] "AWAPART"  "ABESAUT"  "AMOTSCO"  "ATRACTOR" "ABROM"    "ALEVEN"  
## [49] "AGEZONG"  "ABRAND"   "AZEILPL"  "APLEZIER" "AFIETS"   "AINBOED" 
## [55] "ABYSTAND" "CARAVAN"
# 56 columns remain (55 predictors + the response CARAVAN).
length(train_2)
## [1] 56

After having removed all the variables which are highly correlated with each other, we are left with 56 variables of importance.

# Fit a full logistic regression on the reduced variable set.
# NOTE(review): the "fitted probabilities numerically 0 or 1" warning below
# signals (quasi-)separation — some coefficient estimates are unreliable.
step.wise1 = glm(CARAVAN ~ ., data = train_2, family = binomial(link = "logit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Inspect coefficients; many dummy levels show huge standard errors,
# presumably from the separation warned about above — verify before use.
summary(step.wise1)
## 
## Call:
## glm(formula = CARAVAN ~ ., family = binomial(link = "logit"), 
##     data = train_2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.7081  -0.5318   0.0001   0.6927   2.0713  
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.450e+00  4.510e+03  -0.001 0.999213    
## MOSTYPE2     2.357e-01  8.749e-01   0.269 0.787613    
## MOSTYPE3     8.912e-01  8.058e-01   1.106 0.268786    
## MOSTYPE4    -9.496e-01  8.834e-01  -1.075 0.282404    
## MOSTYPE5     5.134e+00  1.963e+00   2.616 0.008901 ** 
## MOSTYPE6     6.354e-01  3.261e-01   1.948 0.051363 .  
## MOSTYPE7    -3.775e-01  8.469e-01  -0.446 0.655804    
## MOSTYPE8     1.467e+00  5.691e-01   2.577 0.009970 ** 
## MOSTYPE9     6.266e+00  1.889e+00   3.317 0.000909 ***
## MOSTYPE10    3.363e-02  3.194e-01   0.105 0.916135    
## MOSTYPE11   -2.808e-01  7.973e-01  -0.352 0.724703    
## MOSTYPE12    2.043e+00  5.871e-01   3.480 0.000502 ***
## MOSTYPE13    4.911e-01  7.989e-01   0.615 0.538759    
## MOSTYPE15   -2.487e+01  3.096e+03  -0.008 0.993591    
## MOSTYPE16   -2.241e+01  1.718e+03  -0.013 0.989590    
## MOSTYPE17   -1.674e+01  1.825e+03  -0.009 0.992684    
## MOSTYPE18   -2.465e+01  1.596e+03  -0.015 0.987679    
## MOSTYPE19   -1.342e+01  2.911e+03  -0.005 0.996322    
## MOSTYPE20   -1.174e+01  1.353e+03  -0.009 0.993077    
## MOSTYPE21   -2.475e+01  1.728e+03  -0.014 0.988573    
## MOSTYPE22   -1.161e+01  1.353e+03  -0.009 0.993153    
## MOSTYPE23    5.188e+00  1.939e+00   2.676 0.007448 ** 
## MOSTYPE24   -1.160e+01  1.353e+03  -0.009 0.993162    
## MOSTYPE25   -1.120e+01  1.353e+03  -0.008 0.993398    
## MOSTYPE26   -1.003e+01  1.353e+03  -0.007 0.994083    
## MOSTYPE27   -1.262e+01  1.353e+03  -0.009 0.992558    
## MOSTYPE28   -3.110e+01  2.383e+03  -0.013 0.989587    
## MOSTYPE29    6.729e+00  1.967e+00   3.421 0.000623 ***
## MOSTYPE30   -1.006e+01  1.353e+03  -0.007 0.994065    
## MOSTYPE31   -9.197e+00  1.353e+03  -0.007 0.994577    
## MOSTYPE32   -8.997e+00  1.353e+03  -0.007 0.994694    
## MOSTYPE33    7.609e+00  1.938e+00   3.927 8.62e-05 ***
## MOSTYPE34    7.385e-01  8.150e-01   0.906 0.364913    
## MOSTYPE35    5.400e+00  1.849e+00   2.920 0.003496 ** 
## MOSTYPE36    7.741e+00  1.942e+00   3.986 6.72e-05 ***
## MOSTYPE37    7.191e+00  1.897e+00   3.792 0.000150 ***
## MOSTYPE38    7.522e+00  1.888e+00   3.984 6.77e-05 ***
## MOSTYPE39    6.441e+00  1.845e+00   3.492 0.000480 ***
## MOSTYPE40   -1.124e+01  6.541e+02  -0.017 0.986294    
## MOSTYPE41    5.267e+00  1.901e+00   2.771 0.005594 ** 
## MGODRK1      7.030e-01  1.213e-01   5.794 6.89e-09 ***
## MGODRK2      9.424e-01  1.280e-01   7.361 1.82e-13 ***
## MGODRK3     -1.204e+00  2.944e-01  -4.090 4.31e-05 ***
## MGODRK4     -1.305e+00  5.893e-01  -2.215 0.026752 *  
## MGODRK5      1.979e+00  6.371e-01   3.107 0.001889 ** 
## MGODRK6      2.481e+00  8.038e-01   3.087 0.002023 ** 
## MGODRK7     -1.502e+01  2.169e+03  -0.007 0.994474    
## MGODRK8     -1.270e+01  2.986e+03  -0.004 0.996607    
## MGODRK9     -1.226e+01  2.478e+03  -0.005 0.996053    
## MGODPR1      6.325e-01  6.817e-01   0.928 0.353544    
## MGODPR2      2.728e+00  6.276e-01   4.346 1.39e-05 ***
## MGODPR3      2.965e+00  6.131e-01   4.837 1.32e-06 ***
## MGODPR4      3.146e+00  6.143e-01   5.120 3.05e-07 ***
## MGODPR5      3.436e+00  6.136e-01   5.600 2.14e-08 ***
## MGODPR6      2.709e+00  6.028e-01   4.494 6.99e-06 ***
## MGODPR7      4.102e+00  6.214e-01   6.602 4.06e-11 ***
## MGODPR8      3.069e+00  7.053e-01   4.350 1.36e-05 ***
## MGODPR9      4.140e+00  6.635e-01   6.240 4.37e-10 ***
## MGODOV1     -4.422e-01  1.180e-01  -3.747 0.000179 ***
## MGODOV2      7.438e-02  1.191e-01   0.624 0.532382    
## MGODOV3      6.231e-01  1.981e-01   3.146 0.001655 ** 
## MGODOV4     -5.756e-01  3.211e-01  -1.793 0.073001 .  
## MGODOV5      2.210e+00  7.124e-01   3.102 0.001924 ** 
## MRELSA1     -2.780e-02  1.109e-01  -0.251 0.802101    
## MRELSA2      1.148e-01  1.269e-01   0.905 0.365661    
## MRELSA3      9.149e-01  2.658e-01   3.442 0.000578 ***
## MRELSA4     -2.655e+00  5.598e-01  -4.742 2.11e-06 ***
## MRELSA5     -1.712e+01  1.311e+03  -0.013 0.989582    
## MRELSA6     -1.621e+01  1.504e+03  -0.011 0.991401    
## MRELSA7     -1.523e+01  6.523e+03  -0.002 0.998137    
## MRELOV1     -8.892e-02  1.844e-01  -0.482 0.629654    
## MRELOV2     -2.407e-01  1.469e-01  -1.639 0.101290    
## MRELOV3     -4.754e-01  1.651e-01  -2.880 0.003978 ** 
## MRELOV4     -8.582e-01  2.157e-01  -3.979 6.93e-05 ***
## MRELOV5     -1.502e+00  3.307e-01  -4.544 5.53e-06 ***
## MRELOV6     -2.860e+00  4.556e-01  -6.277 3.46e-10 ***
## MRELOV7     -3.892e+00  6.326e-01  -6.152 7.66e-10 ***
## MRELOV8     -1.509e+01  7.680e+02  -0.020 0.984328    
## MRELOV9      1.960e+01  5.898e+02   0.033 0.973486    
## MFALLEEN1   -1.122e-01  1.474e-01  -0.761 0.446783    
## MFALLEEN2   -5.165e-02  1.974e-01  -0.262 0.793590    
## MFALLEEN3    8.372e-02  2.728e-01   0.307 0.758928    
## MFALLEEN4    4.383e-01  3.851e-01   1.138 0.255027    
## MFALLEEN5   -2.181e-01  5.167e-01  -0.422 0.672883    
## MFALLEEN6    1.376e+00  6.638e-01   2.074 0.038108 *  
## MFALLEEN7   -1.730e+01  5.898e+02  -0.029 0.976601    
## MFALLEEN8    2.637e+00  1.032e+00   2.555 0.010607 *  
## MFALLEEN9   -1.491e+01  8.832e+02  -0.017 0.986530    
## MFGEKIND1    1.281e+00  3.478e-01   3.682 0.000231 ***
## MFGEKIND2    6.275e-01  3.649e-01   1.720 0.085485 .  
## MFGEKIND3    6.645e-01  4.014e-01   1.655 0.097874 .  
## MFGEKIND4    8.021e-01  4.518e-01   1.776 0.075804 .  
## MFGEKIND5    2.664e-01  5.305e-01   0.502 0.615605    
## MFGEKIND6   -9.850e-01  6.095e-01  -1.616 0.106034    
## MFGEKIND7    5.140e-01  7.529e-01   0.683 0.494852    
## MFGEKIND8    4.297e+00  1.197e+00   3.588 0.000333 ***
## MFGEKIND9   -1.831e+00  1.440e+00  -1.271 0.203708    
## MFWEKIND1   -1.813e+00  4.651e-01  -3.898 9.68e-05 ***
## MFWEKIND2   -1.139e+00  4.463e-01  -2.552 0.010697 *  
## MFWEKIND3   -1.699e+00  4.849e-01  -3.504 0.000458 ***
## MFWEKIND4   -2.050e+00  5.426e-01  -3.778 0.000158 ***
## MFWEKIND5   -2.600e+00  5.951e-01  -4.370 1.24e-05 ***
## MFWEKIND6   -1.978e+00  6.704e-01  -2.950 0.003178 ** 
## MFWEKIND7   -2.432e+00  7.479e-01  -3.252 0.001147 ** 
## MFWEKIND8   -1.951e+00  8.512e-01  -2.292 0.021911 *  
## MFWEKIND9   -1.151e+00  9.157e-01  -1.257 0.208580    
## MOPLHOOG1    7.477e-01  1.366e-01   5.473 4.41e-08 ***
## MOPLHOOG2    1.074e-01  1.827e-01   0.588 0.556613    
## MOPLHOOG3   -3.639e-01  2.811e-01  -1.295 0.195469    
## MOPLHOOG4    3.225e-01  3.627e-01   0.889 0.373927    
## MOPLHOOG5    6.917e-01  4.627e-01   1.495 0.134942    
## MOPLHOOG6   -2.124e+00  6.704e-01  -3.169 0.001530 ** 
## MOPLHOOG7   -3.700e-01  7.939e-01  -0.466 0.641120    
## MOPLHOOG8   -3.174e-01  8.979e-01  -0.354 0.723708    
## MOPLHOOG9   -1.018e+00  1.223e+00  -0.832 0.405170    
## MOPLMIDD1   -4.894e-01  4.190e-01  -1.168 0.242807    
## MOPLMIDD2   -6.603e-01  3.925e-01  -1.682 0.092505 .  
## MOPLMIDD3   -1.215e+00  4.317e-01  -2.815 0.004874 ** 
## MOPLMIDD4   -1.758e+00  4.737e-01  -3.711 0.000206 ***
## MOPLMIDD5   -1.594e+00  5.379e-01  -2.963 0.003043 ** 
## MOPLMIDD6   -1.622e+00  6.281e-01  -2.583 0.009805 ** 
## MOPLMIDD7   -1.781e+00  7.521e-01  -2.367 0.017914 *  
## MOPLMIDD8   -2.042e+00  8.919e-01  -2.289 0.022058 *  
## MOPLMIDD9   -2.408e+00  9.724e-01  -2.476 0.013290 *  
## MOPLLAAG1   -1.520e+00  3.958e-01  -3.840 0.000123 ***
## MOPLLAAG2   -9.887e-01  3.880e-01  -2.548 0.010831 *  
## MOPLLAAG3   -1.092e+00  4.230e-01  -2.581 0.009840 ** 
## MOPLLAAG4   -2.117e+00  4.860e-01  -4.356 1.32e-05 ***
## MOPLLAAG5   -2.370e+00  5.546e-01  -4.273 1.93e-05 ***
## MOPLLAAG6   -2.533e+00  6.277e-01  -4.036 5.44e-05 ***
## MOPLLAAG7   -3.256e+00  7.309e-01  -4.455 8.38e-06 ***
## MOPLLAAG8   -2.833e+00  8.593e-01  -3.297 0.000979 ***
## MOPLLAAG9   -5.633e+00  9.488e-01  -5.937 2.90e-09 ***
## MBERHOOG1   -4.298e-01  1.486e-01  -2.891 0.003836 ** 
## MBERHOOG2    5.324e-01  1.664e-01   3.199 0.001379 ** 
## MBERHOOG3    2.995e-01  2.233e-01   1.341 0.179828    
## MBERHOOG4    1.576e+00  2.915e-01   5.408 6.37e-08 ***
## MBERHOOG5    9.862e-01  3.906e-01   2.525 0.011565 *  
## MBERHOOG6    3.688e+00  4.841e-01   7.617 2.59e-14 ***
## MBERHOOG7    3.776e+00  5.992e-01   6.301 2.95e-10 ***
## MBERHOOG8    7.435e+00  9.388e-01   7.919 2.39e-15 ***
## MBERHOOG9    3.098e+00  8.571e-01   3.615 0.000301 ***
## MBERZELF1    6.746e-01  1.221e-01   5.523 3.33e-08 ***
## MBERZELF2    7.308e-01  1.897e-01   3.852 0.000117 ***
## MBERZELF3    9.332e-01  5.111e-01   1.826 0.067883 .  
## MBERZELF4    2.150e+00  8.451e-01   2.544 0.010945 *  
## MBERZELF5    1.407e+00  8.222e-01   1.711 0.087010 .  
## MBERBOER1    7.390e-01  1.245e-01   5.936 2.92e-09 ***
## MBERBOER2    1.394e+00  1.996e-01   6.987 2.82e-12 ***
## MBERBOER3    3.230e+00  3.737e-01   8.643  < 2e-16 ***
## MBERBOER4    9.550e-01  6.170e-01   1.548 0.121691    
## MBERBOER5    4.783e+00  7.229e-01   6.616 3.68e-11 ***
## MBERBOER6   -1.047e+01  1.167e+03  -0.009 0.992841    
## MBERBOER7    4.427e+00  3.197e+03   0.001 0.998895    
## MBERBOER8    1.236e+01  2.543e+03   0.005 0.996120    
## MBERBOER9    9.109e+00  3.305e+03   0.003 0.997801    
## MBERMIDD1    1.312e-02  2.716e-01   0.048 0.961475    
## MBERMIDD2    1.028e+00  2.376e-01   4.325 1.52e-05 ***
## MBERMIDD3    1.011e+00  2.759e-01   3.662 0.000250 ***
## MBERMIDD4    2.199e+00  3.237e-01   6.792 1.11e-11 ***
## MBERMIDD5    2.071e+00  3.911e-01   5.295 1.19e-07 ***
## MBERMIDD6    3.543e+00  4.748e-01   7.462 8.53e-14 ***
## MBERMIDD7    3.913e+00  5.822e-01   6.721 1.81e-11 ***
## MBERMIDD8   -1.933e+01  1.273e+03  -0.015 0.987880    
## MBERMIDD9    5.097e+00  7.254e-01   7.027 2.11e-12 ***
## MBERARBG1    1.485e+00  1.644e-01   9.031  < 2e-16 ***
## MBERARBG2    1.259e+00  1.915e-01   6.576 4.84e-11 ***
## MBERARBG3    1.034e+00  2.427e-01   4.262 2.03e-05 ***
## MBERARBG4    1.634e+00  3.052e-01   5.353 8.64e-08 ***
## MBERARBG5    2.963e+00  3.883e-01   7.630 2.35e-14 ***
## MBERARBG6    1.114e+00  4.935e-01   2.258 0.023951 *  
## MBERARBG7    5.194e+00  6.536e-01   7.947 1.90e-15 ***
## MBERARBG8    4.121e+00  7.117e-01   5.790 7.05e-09 ***
## MBERARBG9    6.042e+00  9.366e-01   6.451 1.11e-10 ***
## MBERARBO1    7.554e-03  1.651e-01   0.046 0.963514    
## MBERARBO2    2.503e-01  1.861e-01   1.345 0.178733    
## MBERARBO3    7.295e-01  2.297e-01   3.175 0.001497 ** 
## MBERARBO4    1.779e+00  2.906e-01   6.121 9.28e-10 ***
## MBERARBO5    2.164e+00  3.927e-01   5.512 3.56e-08 ***
## MBERARBO6    1.532e+00  4.869e-01   3.147 0.001648 ** 
## MBERARBO7    3.781e+00  6.914e-01   5.470 4.51e-08 ***
## MBERARBO8    4.360e+00  1.182e+00   3.689 0.000225 ***
## MBERARBO9   -1.079e+01  1.045e+03  -0.010 0.991759    
## MSKB11       2.628e-01  1.514e-01   1.736 0.082483 .  
## MSKB12      -5.124e-01  1.563e-01  -3.278 0.001046 ** 
## MSKB13      -1.925e-01  1.842e-01  -1.046 0.295783    
## MSKB14      -1.041e+00  2.654e-01  -3.924 8.72e-05 ***
## MSKB15      -4.193e-01  3.899e-01  -1.076 0.282112    
## MSKB16       9.354e-01  4.931e-01   1.897 0.057835 .  
## MSKB17      -1.747e+01  1.994e+03  -0.009 0.993008    
## MSKB18       1.399e+00  8.087e-01   1.730 0.083718 .  
## MSKB19      -1.970e+01  1.324e+03  -0.015 0.988123    
## MSKB21      -6.648e-01  1.806e-01  -3.681 0.000232 ***
## MSKB22      -4.053e-01  1.660e-01  -2.441 0.014650 *  
## MSKB23      -8.808e-01  1.941e-01  -4.538 5.67e-06 ***
## MSKB24      -3.325e-01  2.132e-01  -1.560 0.118866    
## MSKB25      -7.836e-01  2.779e-01  -2.820 0.004798 ** 
## MSKB26      -9.965e-01  4.783e-01  -2.084 0.037204 *  
## MSKB27      -1.734e+01  1.992e+03  -0.009 0.993055    
## MSKB28      -2.199e+01  2.096e+03  -0.010 0.991631    
## MSKB29       9.281e+00  5.601e+03   0.002 0.998678    
## MSKC1        1.686e+00  3.841e-01   4.390 1.13e-05 ***
## MSKC2        1.744e+00  3.646e-01   4.782 1.74e-06 ***
## MSKC3        1.453e+00  3.654e-01   3.976 7.00e-05 ***
## MSKC4        1.712e+00  3.879e-01   4.414 1.02e-05 ***
## MSKC5        1.325e+00  4.119e-01   3.216 0.001301 ** 
## MSKC6        1.681e+00  4.554e-01   3.692 0.000223 ***
## MSKC7        2.239e+00  5.240e-01   4.273 1.93e-05 ***
## MSKC8        2.832e+00  5.861e-01   4.832 1.35e-06 ***
## MSKC9        1.080e+00  6.355e-01   1.700 0.089166 .  
## MSKD1       -1.564e-01  1.173e-01  -1.333 0.182407    
## MSKD2       -2.059e-01  1.449e-01  -1.421 0.155353    
## MSKD3       -8.769e-01  2.247e-01  -3.903 9.51e-05 ***
## MSKD4       -4.739e-01  3.559e-01  -1.331 0.183091    
## MSKD5       -3.516e+00  9.437e-01  -3.726 0.000195 ***
## MSKD6       -1.777e+01  5.875e+02  -0.030 0.975875    
## MSKD7       -5.522e-01  1.196e+00  -0.462 0.644364    
## MSKD9       -1.417e+01  6.523e+03  -0.002 0.998267    
## MHKOOP1     -9.229e-01  2.222e-01  -4.153 3.28e-05 ***
## MHKOOP2     -1.565e+00  2.048e-01  -7.642 2.14e-14 ***
## MHKOOP3     -7.291e-01  2.027e-01  -3.596 0.000323 ***
## MHKOOP4     -1.166e+00  2.074e-01  -5.622 1.89e-08 ***
## MHKOOP5     -9.826e-01  1.964e-01  -5.002 5.68e-07 ***
## MHKOOP6     -4.430e-01  2.006e-01  -2.208 0.027225 *  
## MHKOOP7     -1.256e+00  1.945e-01  -6.455 1.08e-10 ***
## MHKOOP8     -8.889e-01  2.118e-01  -4.197 2.71e-05 ***
## MHKOOP9     -3.542e-01  1.975e-01  -1.794 0.072837 .  
## MAUT11       7.182e+00  7.930e+03   0.001 0.999277    
## MAUT12       1.142e+01  4.302e+03   0.003 0.997882    
## MAUT13       8.272e+00  4.302e+03   0.002 0.998466    
## MAUT14       1.189e+01  4.302e+03   0.003 0.997796    
## MAUT15       1.245e+01  4.302e+03   0.003 0.997690    
## MAUT16       1.220e+01  4.302e+03   0.003 0.997737    
## MAUT17       1.205e+01  4.302e+03   0.003 0.997766    
## MAUT18       1.186e+01  4.302e+03   0.003 0.997801    
## MAUT19       1.061e+01  4.302e+03   0.002 0.998032    
## MAUT21       2.048e-01  1.438e-01   1.424 0.154311    
## MAUT22      -5.603e-01  2.131e-01  -2.629 0.008562 ** 
## MAUT23      -1.414e+00  3.424e-01  -4.130 3.63e-05 ***
## MAUT24      -6.115e-01  4.561e-01  -1.341 0.180018    
## MAUT25      -1.521e+00  7.340e-01  -2.072 0.038304 *  
## MAUT26       1.689e+00  1.412e+00   1.196 0.231670    
## MAUT27       4.722e+00  6.600e+03   0.001 0.999429    
## MAUT01      -7.409e-01  1.867e-01  -3.969 7.23e-05 ***
## MAUT02      -4.470e-01  2.235e-01  -2.000 0.045509 *  
## MAUT03      -4.565e-01  3.369e-01  -1.355 0.175423    
## MAUT04      -2.240e+00  4.569e-01  -4.904 9.41e-07 ***
## MAUT05      -1.108e+00  6.313e-01  -1.755 0.079331 .  
## MAUT06       3.528e+00  1.027e+00   3.437 0.000589 ***
## MAUT07      -4.590e+01  1.104e+03  -0.042 0.966821    
## MAUT08      -1.159e+01  6.750e+03  -0.002 0.998630    
## MAUT09      -7.937e+00  4.820e+03  -0.002 0.998686    
## MZFONDS1    -1.793e+01  1.253e+03  -0.014 0.988580    
## MZFONDS2     4.261e-01  4.114e-01   1.036 0.300278    
## MZFONDS3    -1.769e-01  4.294e-01  -0.412 0.680374    
## MZFONDS4    -7.074e-02  4.214e-01  -0.168 0.866684    
## MZFONDS5     2.833e-01  4.234e-01   0.669 0.503475    
## MZFONDS6    -4.051e-01  4.246e-01  -0.954 0.339967    
## MZFONDS7     5.477e-01  4.287e-01   1.278 0.201385    
## MZFONDS8     1.010e+00  4.474e-01   2.258 0.023939 *  
## MZFONDS9     6.235e-01  4.498e-01   1.386 0.165690    
## MINKM301     7.072e-02  1.597e-01   0.443 0.657930    
## MINKM302     2.935e-01  1.794e-01   1.636 0.101887    
## MINKM303    -1.269e+00  2.504e-01  -5.070 3.98e-07 ***
## MINKM304    -1.904e+00  3.356e-01  -5.673 1.41e-08 ***
## MINKM305    -1.644e+00  4.212e-01  -3.903 9.51e-05 ***
## MINKM306    -1.134e+00  5.111e-01  -2.219 0.026490 *  
## MINKM307    -2.092e+00  6.618e-01  -3.162 0.001569 ** 
## MINKM308    -3.000e+00  8.073e-01  -3.716 0.000202 ***
## MINKM309    -1.956e+01  5.913e+02  -0.033 0.973619    
## MINK30451   -1.095e-01  3.015e-01  -0.363 0.716430    
## MINK30452   -4.677e-01  2.642e-01  -1.770 0.076758 .  
## MINK30453   -9.844e-01  3.121e-01  -3.154 0.001611 ** 
## MINK30454   -1.510e+00  3.603e-01  -4.192 2.77e-05 ***
## MINK30455   -2.055e+00  4.456e-01  -4.612 3.98e-06 ***
## MINK30456   -2.658e+00  5.315e-01  -5.001 5.69e-07 ***
## MINK30457   -2.673e+00  6.216e-01  -4.300 1.71e-05 ***
## MINK30458   -2.864e+00  8.028e-01  -3.567 0.000361 ***
## MINK30459   -8.936e-01  7.864e-01  -1.136 0.255802    
## MINK45751   -4.089e-01  2.135e-01  -1.915 0.055510 .  
## MINK45752   -4.238e-01  2.128e-01  -1.992 0.046401 *  
## MINK45753   -3.178e-01  2.649e-01  -1.200 0.230241    
## MINK45754   -1.026e+00  3.321e-01  -3.089 0.002007 ** 
## MINK45755   -1.049e+00  4.309e-01  -2.434 0.014932 *  
## MINK45756   -1.605e+00  5.160e-01  -3.110 0.001869 ** 
## MINK45757   -2.377e+00  6.419e-01  -3.703 0.000213 ***
## MINK45758   -3.514e+00  7.744e-01  -4.538 5.68e-06 ***
## MINK45759   -3.860e+00  8.202e-01  -4.706 2.53e-06 ***
## MINK75121    4.807e-01  1.154e-01   4.166 3.10e-05 ***
## MINK75122   -9.441e-02  1.663e-01  -0.568 0.570276    
## MINK75123   -1.022e+00  2.698e-01  -3.788 0.000152 ***
## MINK75124   -6.879e-01  3.651e-01  -1.884 0.059519 .  
## MINK75125   -2.128e+00  5.301e-01  -4.015 5.95e-05 ***
## MINK75126   -1.935e+01  1.649e+03  -0.012 0.990637    
## MINK75127   -2.080e+01  6.523e+03  -0.003 0.997455    
## MINK75128   -2.180e+01  2.372e+03  -0.009 0.992667    
## MINK75129    3.470e+00  1.257e+00   2.760 0.005785 ** 
## MINK123M1   -5.495e-01  1.419e-01  -3.873 0.000107 ***
## MINK123M2   -8.495e-01  3.338e-01  -2.545 0.010937 *  
## MINK123M3   -3.197e+00  7.446e-01  -4.294 1.76e-05 ***
## MINK123M4   -2.144e+01  1.008e+03  -0.021 0.983027    
## MINK123M5   -9.483e-02  6.656e+03   0.000 0.999989    
## MINK123M7   -2.423e+01  6.523e+03  -0.004 0.997036    
## MINK123M9    3.211e-01  6.656e+03   0.000 0.999962    
## MKOOPKLA2    1.356e+00  7.787e-01   1.742 0.081587 .  
## MKOOPKLA3   -1.643e+01  1.353e+03  -0.012 0.990314    
## MKOOPKLA4   -1.568e+01  1.353e+03  -0.012 0.990755    
## MKOOPKLA5   -1.533e+01  1.353e+03  -0.011 0.990958    
## MKOOPKLA6   -9.943e+00  1.353e+03  -0.007 0.994136    
## MKOOPKLA7   -1.038e+01  1.353e+03  -0.008 0.993878    
## MKOOPKLA8   -1.029e+01  1.353e+03  -0.008 0.993934    
## PWABEDR1    -1.643e+01  2.052e+03  -0.008 0.993612    
## PWABEDR2     1.593e+00  4.875e-01   3.268 0.001084 ** 
## PWABEDR3     9.570e-01  5.578e-01   1.716 0.086222 .  
## PWABEDR4    -1.897e+01  1.226e+03  -0.015 0.987658    
## PWABEDR5     1.111e+01  9.541e+03   0.001 0.999071    
## PWABEDR6    -1.651e+01  3.233e+03  -0.005 0.995924    
## PWALAND2    -1.751e+01  3.522e+03  -0.005 0.996033    
## PWALAND3    -1.839e+00  4.915e-01  -3.742 0.000183 ***
## PWALAND4    -1.821e+00  5.563e-01  -3.274 0.001062 ** 
## PPERSAUT4   -1.883e+01  6.523e+03  -0.003 0.997696    
## PPERSAUT5   -5.477e-02  1.280e-01  -0.428 0.668716    
## PPERSAUT6    2.005e+00  7.165e-02  27.988  < 2e-16 ***
## PPERSAUT7   -1.781e+01  7.944e+02  -0.022 0.982112    
## PPERSAUT8   -1.768e+01  4.600e+03  -0.004 0.996933    
## PVRAAUT4    -1.357e+00  6.637e+03   0.000 0.999837    
## PVRAAUT6    -1.802e+01  2.439e+03  -0.007 0.994104    
## PVRAAUT9     2.729e+00  6.571e+03   0.000 0.999669    
## PAANHANG1    4.933e-02  5.405e-01   0.091 0.927272    
## PAANHANG2    1.501e-01  4.553e-01   0.330 0.741550    
## PAANHANG3   -1.662e+01  2.450e+03  -0.007 0.994588    
## PAANHANG4   -2.016e+01  6.523e+03  -0.003 0.997534    
## PAANHANG5   -3.358e+00  6.964e+03   0.000 0.999615    
## PWERKT2     -1.529e+01  2.280e+03  -0.007 0.994650    
## PWERKT3     -1.576e+01  2.035e+03  -0.008 0.993818    
## PWERKT4     -1.795e+01  1.855e+03  -0.010 0.992279    
## PWERKT6     -1.209e+01  6.523e+03  -0.002 0.998521    
## PPERSONG1   -1.880e+01  3.291e+03  -0.006 0.995441    
## PPERSONG2   -3.514e+00  9.973e-01  -3.523 0.000426 ***
## PPERSONG3   -2.013e+01  2.338e+03  -0.009 0.993131    
## PPERSONG4   -1.632e+01  3.084e+03  -0.005 0.995778    
## PPERSONG5   -1.422e+01  6.523e+03  -0.002 0.998261    
## PPERSONG6   -1.586e+01  3.671e+03  -0.004 0.996553    
## PWAOREG4    -1.906e+01  6.523e+03  -0.003 0.997669    
## PWAOREG5    -1.233e+01  6.523e+03  -0.002 0.998492    
## PWAOREG6     3.777e+00  4.798e-01   7.872 3.50e-15 ***
## PWAOREG7    -2.153e+01  5.589e+03  -0.004 0.996927    
## PPLEZIER1    6.305e+00  1.595e+00   3.952 7.75e-05 ***
## PPLEZIER2    9.672e-01  1.521e+00   0.636 0.524861    
## PPLEZIER3    4.478e+00  1.488e+00   3.008 0.002626 ** 
## PPLEZIER4    1.757e+00  1.253e+00   1.402 0.160955    
## PPLEZIER5   -1.444e+01  3.623e+03  -0.004 0.996819    
## PPLEZIER6    4.053e+01  8.550e+02   0.047 0.962195    
## AWAPART1     3.762e-01  8.209e-02   4.583 4.58e-06 ***
## AWAPART2    -1.798e+01  1.920e+03  -0.009 0.992528    
## ABESAUT1     1.006e+00  4.200e-01   2.395 0.016608 *  
## ABESAUT2    -1.675e+01  2.618e+03  -0.006 0.994894    
## ABESAUT3    -1.392e+01  2.721e+03  -0.005 0.995918    
## ABESAUT4     4.541e+01  1.011e+04   0.004 0.996415    
## AMOTSCO1     5.853e-02  1.575e-01   0.372 0.710210    
## AMOTSCO2     9.850e-03  5.264e-01   0.019 0.985072    
## AMOTSCO8    -1.800e+01  6.523e+03  -0.003 0.997798    
## ATRACTOR1   -7.002e-01  3.318e-01  -2.110 0.034858 *  
## ATRACTOR2    2.363e-01  6.651e-01   0.355 0.722347    
## ATRACTOR3   -1.566e+01  2.178e+03  -0.007 0.994264    
## ATRACTOR4   -1.434e+01  2.010e+03  -0.007 0.994307    
## ABROM1      -7.609e-01  1.628e-01  -4.673 2.97e-06 ***
## ABROM2      -1.724e+01  1.472e+03  -0.012 0.990660    
## ALEVEN1     -9.627e-01  1.964e-01  -4.902 9.46e-07 ***
## ALEVEN2     -1.692e-01  2.089e-01  -0.810 0.417797    
## ALEVEN3      1.454e-02  4.856e-01   0.030 0.976111    
## ALEVEN4      2.092e+00  6.509e-01   3.214 0.001308 ** 
## ALEVEN8      1.717e+01  8.003e+03   0.002 0.998288    
## AGEZONG1     3.194e-01  3.252e-01   0.982 0.325952    
## ABRAND1      5.744e-01  8.376e-02   6.858 6.98e-12 ***
## ABRAND2      7.562e-02  2.310e-01   0.327 0.743435    
## ABRAND3     -1.917e+01  1.682e+03  -0.011 0.990904    
## ABRAND4     -1.776e+01  2.707e+03  -0.007 0.994765    
## ABRAND5     -8.689e+00  2.752e+03  -0.003 0.997481    
## ABRAND7     -1.703e+01  6.523e+03  -0.003 0.997917    
## AZEILPL1    -1.844e+01  1.833e+03  -0.010 0.991975    
## APLEZIER1    8.712e-02  1.341e+00   0.065 0.948190    
## APLEZIER2           NA         NA      NA       NA    
## AFIETS1      1.462e+00  2.241e-01   6.525 6.80e-11 ***
## AFIETS2      2.062e-01  3.536e-01   0.583 0.559904    
## AFIETS3      1.601e+00  1.384e+00   1.157 0.247467    
## AINBOED1    -4.217e-01  3.385e-01  -1.246 0.212785    
## AINBOED2    -2.158e+01  6.523e+03  -0.003 0.997360    
## ABYSTAND1    1.042e+00  2.132e-01   4.887 1.02e-06 ***
## ABYSTAND2   -1.837e+01  6.523e+03  -0.003 0.997752    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15177.2  on 10947  degrees of freedom
## Residual deviance:  8889.3  on 10559  degrees of freedom
## AIC: 9667.3
## 
## Number of Fisher Scoring iterations: 17
# difference between null deviances = 15177 - 12386 = 2791
# step(step.wise1, direction = "backward")  # note: step() accepts "backward", not "backwards"

Here we have applied GLM to our 56 variables, because it must be remembered that our dataset is categorical in nature. After running the regression we have seen that most of the variables are significant. This means that the dimension reduction we performed earlier is helping us to come up with a good model.

Now we went one step further and applied a step-wise backward regression, which helped us further reduce the model from 56 to 41 variables, with a difference in null deviance of 2783.

In the next step we will take the 41 variables which we got from step-wise

# Convert every column of train_2 to a factor so downstream models treat the
# integer-coded survey variables as categorical rather than numeric.
# Replaces an index loop over 1:ncol(train_2) — which is unsafe for a
# zero-column frame and non-idiomatic — with the equivalent whole-frame
# lapply assignment; `train_2[]` preserves the data.frame structure.
train_2[] <- lapply(train_2, as.factor)

Model 2

# Model 2: logistic regression on the 41 predictors retained by the backward
# step-wise selection, fit on the oversampled training set.
model.2 = glm(formula = CARAVAN ~ MOSTYPE + MGODRK + MGODPR + MGODOV + 
    MRELGE + MRELSA + MOPLMIDD + MOPLLAAG + MBERHOOG + MBERZELF + 
    MBERBOER + MBERMIDD + MBERARBG + MBERARBO + MSKC + MSKD + 
    MHKOOP + MAUT1 + MAUT2 + MAUT0 + MINK3045 + MINK7512 + MINK123M + 
    MKOOPKLA + PPERSAUT + PMOTSCO + PVRAAUT + PAANHANG + PWERKT + 
    PWAOREG + PPLEZIER + AWAPART + AWALAND + ABROM + ALEVEN + 
    APERSONG + AGEZONG + ABRAND + APLEZIER + AFIETS + ABYSTAND, 
    family = binomial(link = "logit"), data = over_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Merge the test set's factor levels into the fitted model's xlevels so that
# predict() does not error on levels that occur in over_test but were absent
# from over_train. NOTE(review): no coefficients were estimated for the added
# levels — verify how predict() scores rows carrying them before trusting
# those predictions.
model.2$xlevels[["MSKD"]] <- union(model.2$xlevels[["MSKD"]], levels(over_test$MSKD))
model.2$xlevels[["MAUT2"]] <- union(model.2$xlevels[["MAUT2"]], levels(over_test$MAUT2))
model.2$xlevels[["MINK123M"]] <- union(model.2$xlevels[["MINK123M"]], levels(over_test$MINK123M))
model.2$xlevels[["PPERSAUT"]] <- union(model.2$xlevels[["PPERSAUT"]], levels(over_test$PPERSAUT))
model.2$xlevels[["PVRAAUT"]] <- union(model.2$xlevels[["PVRAAUT"]], levels(over_test$PVRAAUT))
model.2$xlevels[["PWERKT"]] <- union(model.2$xlevels[["PWERKT"]], levels(over_test$PWERKT))
model.2$xlevels[["ABROM"]] <- union(model.2$xlevels[["ABROM"]], levels(over_test$ABROM))
model.2$xlevels[["ALEVEN"]] <- union(model.2$xlevels[["ALEVEN"]], levels(over_test$ALEVEN))
model.2$xlevels[["ABRAND"]] <- union(model.2$xlevels[["ABRAND"]], levels(over_test$ABRAND))
model.2$xlevels[["AFIETS"]] <- union(model.2$xlevels[["AFIETS"]], levels(over_test$AFIETS))

# Coefficient summary for model 2 (drewSummary is a helper defined elsewhere
# in this file).
drewSummary(model.2)
## 
## Call:
## glm(formula = CARAVAN ~ MOSTYPE + MGODRK + MGODPR + MGODOV + 
##     MRELGE + MRELSA + MOPLMIDD + MOPLLAAG + MBERHOOG + MBERZELF + 
##     MBERBOER + MBERMIDD + MBERARBG + MBERARBO + MSKC + MSKD + 
##     MHKOOP + MAUT1 + MAUT2 + MAUT0 + MINK3045 + MINK7512 + MINK123M + 
##     MKOOPKLA + PPERSAUT + PMOTSCO + PVRAAUT + PAANHANG + PWERKT + 
##     PWAOREG + PPLEZIER + AWAPART + AWALAND + ABROM + ALEVEN + 
##     APERSONG + AGEZONG + ABRAND + APLEZIER + AFIETS + ABYSTAND, 
##     family = binomial(link = "logit"), data = over_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3299  -0.6490   0.0000   0.7778   1.9839  
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.087e+00  4.635e+03  -0.001 0.998952    
## MOSTYPE2    -8.090e-01  7.424e-01  -1.090 0.275839    
## MOSTYPE3    -3.755e-01  6.901e-01  -0.544 0.586295    
## MOSTYPE4    -1.662e+00  7.409e-01  -2.243 0.024883 *  
## MOSTYPE5     1.506e+00  1.613e+00   0.933 0.350609    
## MOSTYPE6    -7.457e-01  2.646e-01  -2.818 0.004828 ** 
## MOSTYPE7    -1.701e+00  7.313e-01  -2.325 0.020050 *  
## MOSTYPE8     5.591e-01  4.710e-01   1.187 0.235210    
## MOSTYPE9     1.350e+00  1.553e+00   0.869 0.384893    
## MOSTYPE10   -6.468e-01  2.673e-01  -2.419 0.015546 *  
## MOSTYPE11   -4.601e-01  6.821e-01  -0.675 0.499944    
## MOSTYPE12    1.705e+00  4.805e-01   3.548 0.000388 ***
## MOSTYPE13   -1.136e+00  6.870e-01  -1.653 0.098272 .  
## MOSTYPE15   -2.856e+01  3.101e+03  -0.009 0.992653    
## MOSTYPE16   -2.899e+01  1.713e+03  -0.017 0.986495    
## MOSTYPE17   -1.918e+01  1.886e+03  -0.010 0.991885    
## MOSTYPE18   -3.102e+01  1.749e+03  -0.018 0.985849    
## MOSTYPE19   -1.763e+01  3.117e+03  -0.006 0.995486    
## MOSTYPE20   -1.406e+01  1.335e+03  -0.011 0.991597    
## MOSTYPE21   -2.940e+01  1.945e+03  -0.015 0.987938    
## MOSTYPE22   -1.450e+01  1.335e+03  -0.011 0.991330    
## MOSTYPE23    9.836e-01  1.602e+00   0.614 0.539349    
## MOSTYPE24   -1.435e+01  1.335e+03  -0.011 0.991420    
## MOSTYPE25   -1.466e+01  1.335e+03  -0.011 0.991234    
## MOSTYPE26   -1.339e+01  1.335e+03  -0.010 0.991995    
## MOSTYPE27   -1.553e+01  1.335e+03  -0.012 0.990718    
## MOSTYPE28   -3.207e+01  1.667e+03  -0.019 0.984653    
## MOSTYPE29    1.286e+00  1.621e+00   0.793 0.427564    
## MOSTYPE30   -1.362e+01  1.335e+03  -0.010 0.991858    
## MOSTYPE31   -1.379e+01  1.335e+03  -0.010 0.991754    
## MOSTYPE32   -1.340e+01  1.335e+03  -0.010 0.991991    
## MOSTYPE33    2.375e+00  1.591e+00   1.493 0.135507    
## MOSTYPE34   -4.549e-01  7.010e-01  -0.649 0.516387    
## MOSTYPE35    7.191e-01  1.517e+00   0.474 0.635566    
## MOSTYPE36    3.007e+00  1.596e+00   1.884 0.059612 .  
## MOSTYPE37    2.472e+00  1.557e+00   1.588 0.112277    
## MOSTYPE38    2.255e+00  1.551e+00   1.454 0.145964    
## MOSTYPE39    1.555e+00  1.509e+00   1.031 0.302768    
## MOSTYPE40   -1.687e+01  6.559e+02  -0.026 0.979484    
## MOSTYPE41    8.434e-01  1.565e+00   0.539 0.589972    
## MGODRK1      3.333e-01  9.363e-02   3.560 0.000371 ***
## MGODRK2      3.908e-01  1.028e-01   3.800 0.000145 ***
## MGODRK3     -8.136e-01  2.342e-01  -3.474 0.000512 ***
## MGODRK4     -1.401e+00  5.401e-01  -2.594 0.009483 ** 
## MGODRK5      1.709e+00  5.439e-01   3.142 0.001678 ** 
## MGODRK6      7.815e-01  6.353e-01   1.230 0.218655    
## MGODRK7     -1.571e+01  2.372e+03  -0.007 0.994716    
## MGODRK8     -1.468e+01  3.218e+03  -0.005 0.996361    
## MGODRK9     -1.490e+01  2.503e+03  -0.006 0.995251    
## MGODPR1      6.560e-01  5.705e-01   1.150 0.250167    
## MGODPR2      2.539e+00  5.123e-01   4.956 7.20e-07 ***
## MGODPR3      2.541e+00  5.065e-01   5.016 5.27e-07 ***
## MGODPR4      2.833e+00  5.057e-01   5.603 2.11e-08 ***
## MGODPR5      2.737e+00  5.032e-01   5.440 5.32e-08 ***
## MGODPR6      2.412e+00  5.027e-01   4.798 1.60e-06 ***
## MGODPR7      3.350e+00  5.117e-01   6.546 5.90e-11 ***
## MGODPR8      2.761e+00  5.898e-01   4.681 2.85e-06 ***
## MGODPR9      2.629e+00  5.366e-01   4.899 9.63e-07 ***
## MGODOV1     -4.952e-01  9.676e-02  -5.118 3.09e-07 ***
## MGODOV2      3.286e-03  9.521e-02   0.035 0.972471    
## MGODOV3      3.302e-01  1.662e-01   1.987 0.046945 *  
## MGODOV4     -8.363e-01  2.562e-01  -3.264 0.001098 ** 
## MGODOV5      1.790e+00  5.261e-01   3.403 0.000666 ***
## MRELGE1     -1.843e+01  4.200e+02  -0.044 0.965005    
## MRELGE2     -2.574e+00  5.918e-01  -4.350 1.36e-05 ***
## MRELGE3     -9.259e-01  5.304e-01  -1.746 0.080873 .  
## MRELGE4     -1.254e+00  5.151e-01  -2.434 0.014929 *  
## MRELGE5     -1.101e+00  5.007e-01  -2.199 0.027902 *  
## MRELGE6     -1.389e+00  5.079e-01  -2.734 0.006259 ** 
## MRELGE7     -9.151e-01  5.043e-01  -1.815 0.069593 .  
## MRELGE8     -8.566e-01  5.225e-01  -1.639 0.101112    
## MRELGE9     -8.156e-01  5.074e-01  -1.608 0.107941    
## MRELSA1     -1.420e-02  9.495e-02  -0.150 0.881096    
## MRELSA2     -4.716e-03  1.130e-01  -0.042 0.966707    
## MRELSA3      7.580e-01  2.391e-01   3.170 0.001522 ** 
## MRELSA4     -1.907e+00  5.138e-01  -3.712 0.000206 ***
## MRELSA5     -1.758e+01  1.243e+03  -0.014 0.988718    
## MRELSA6     -1.867e+01  1.548e+03  -0.012 0.990376    
## MRELSA7     -8.653e-01  6.536e+03   0.000 0.999894    
## MOPLMIDD1   -9.922e-02  3.135e-01  -0.316 0.751641    
## MOPLMIDD2   -6.028e-01  2.685e-01  -2.245 0.024769 *  
## MOPLMIDD3   -9.626e-01  2.598e-01  -3.706 0.000211 ***
## MOPLMIDD4   -1.279e+00  2.625e-01  -4.873 1.10e-06 ***
## MOPLMIDD5   -1.180e+00  2.732e-01  -4.319 1.57e-05 ***
## MOPLMIDD6   -1.360e+00  3.014e-01  -4.513 6.38e-06 ***
## MOPLMIDD7   -1.377e+00  3.375e-01  -4.079 4.52e-05 ***
## MOPLMIDD8   -2.117e+00  4.896e-01  -4.324 1.53e-05 ***
## MOPLMIDD9   -2.567e+00  4.502e-01  -5.702 1.19e-08 ***
## MOPLLAAG1   -9.185e-01  2.866e-01  -3.205 0.001353 ** 
## MOPLLAAG2   -6.587e-01  2.497e-01  -2.638 0.008348 ** 
## MOPLLAAG3   -1.006e+00  2.485e-01  -4.048 5.18e-05 ***
## MOPLLAAG4   -1.719e+00  2.584e-01  -6.654 2.84e-11 ***
## MOPLLAAG5   -2.011e+00  2.761e-01  -7.285 3.22e-13 ***
## MOPLLAAG6   -2.161e+00  3.003e-01  -7.197 6.17e-13 ***
## MOPLLAAG7   -2.417e+00  3.378e-01  -7.154 8.43e-13 ***
## MOPLLAAG8   -2.461e+00  4.146e-01  -5.934 2.95e-09 ***
## MOPLLAAG9   -4.682e+00  4.547e-01 -10.296  < 2e-16 ***
## MBERHOOG1   -1.462e-01  1.170e-01  -1.250 0.211457    
## MBERHOOG2    6.360e-01  1.321e-01   4.814 1.48e-06 ***
## MBERHOOG3    2.081e-01  1.833e-01   1.136 0.256058    
## MBERHOOG4    9.875e-01  2.373e-01   4.161 3.17e-05 ***
## MBERHOOG5    6.289e-01  3.184e-01   1.975 0.048292 *  
## MBERHOOG6    2.216e+00  3.834e-01   5.780 7.45e-09 ***
## MBERHOOG7    2.762e+00  4.777e-01   5.783 7.34e-09 ***
## MBERHOOG8    4.752e+00  6.885e-01   6.902 5.13e-12 ***
## MBERHOOG9    1.365e+00  6.605e-01   2.067 0.038721 *  
## MBERZELF1    4.653e-01  1.014e-01   4.590 4.44e-06 ***
## MBERZELF2    5.454e-01  1.493e-01   3.653 0.000260 ***
## MBERZELF3    1.164e+00  4.264e-01   2.730 0.006329 ** 
## MBERZELF4    6.743e-01  7.072e-01   0.954 0.340320    
## MBERZELF5    3.057e-01  4.992e-01   0.612 0.540329    
## MBERBOER1    2.768e-01  1.028e-01   2.691 0.007123 ** 
## MBERBOER2    4.612e-01  1.643e-01   2.807 0.004999 ** 
## MBERBOER3    1.462e+00  2.885e-01   5.066 4.07e-07 ***
## MBERBOER4   -9.216e-02  4.646e-01  -0.198 0.842771    
## MBERBOER5    3.112e+00  5.968e-01   5.215 1.84e-07 ***
## MBERBOER6   -1.218e+01  1.225e+03  -0.010 0.992067    
## MBERBOER7    4.020e+00  3.613e+03   0.001 0.999112    
## MBERBOER8    8.866e+00  2.635e+03   0.003 0.997316    
## MBERBOER9    6.482e+00  3.287e+03   0.002 0.998427    
## MBERMIDD1   -6.315e-01  2.034e-01  -3.105 0.001904 ** 
## MBERMIDD2    8.740e-02  1.753e-01   0.499 0.618005    
## MBERMIDD3    9.349e-02  2.084e-01   0.449 0.653698    
## MBERMIDD4    8.949e-01  2.523e-01   3.546 0.000391 ***
## MBERMIDD5    9.266e-01  3.150e-01   2.942 0.003265 ** 
## MBERMIDD6    1.623e+00  3.723e-01   4.360 1.30e-05 ***
## MBERMIDD7    1.763e+00  4.596e-01   3.835 0.000125 ***
## MBERMIDD8   -1.934e+01  1.300e+03  -0.015 0.988126    
## MBERMIDD9    3.016e+00  5.639e-01   5.348 8.89e-08 ***
## MBERARBG1    1.160e+00  1.362e-01   8.516  < 2e-16 ***
## MBERARBG2    1.032e+00  1.588e-01   6.496 8.27e-11 ***
## MBERARBG3    6.719e-01  2.060e-01   3.261 0.001109 ** 
## MBERARBG4    1.046e+00  2.559e-01   4.086 4.39e-05 ***
## MBERARBG5    2.016e+00  3.275e-01   6.154 7.57e-10 ***
## MBERARBG6    5.985e-01  4.186e-01   1.430 0.152808    
## MBERARBG7    4.111e+00  5.750e-01   7.150 8.67e-13 ***
## MBERARBG8    3.113e+00  6.222e-01   5.004 5.61e-07 ***
## MBERARBG9    4.942e+00  8.327e-01   5.934 2.95e-09 ***
## MBERARBO1   -3.294e-02  1.316e-01  -0.250 0.802362    
## MBERARBO2    1.613e-01  1.498e-01   1.076 0.281743    
## MBERARBO3    4.333e-01  1.865e-01   2.323 0.020179 *  
## MBERARBO4    1.291e+00  2.363e-01   5.465 4.62e-08 ***
## MBERARBO5    1.468e+00  3.195e-01   4.594 4.35e-06 ***
## MBERARBO6    1.256e+00  4.096e-01   3.066 0.002170 ** 
## MBERARBO7    1.941e+00  6.203e-01   3.129 0.001753 ** 
## MBERARBO8    3.977e+00  8.972e-01   4.433 9.28e-06 ***
## MBERARBO9   -1.585e+01  9.171e+02  -0.017 0.986212    
## MSKC1        5.737e-01  2.724e-01   2.106 0.035208 *  
## MSKC2        5.813e-01  2.379e-01   2.443 0.014548 *  
## MSKC3        5.408e-01  2.340e-01   2.311 0.020811 *  
## MSKC4        8.697e-01  2.478e-01   3.509 0.000449 ***
## MSKC5        8.780e-01  2.537e-01   3.461 0.000539 ***
## MSKC6        1.444e+00  2.841e-01   5.082 3.74e-07 ***
## MSKC7        1.322e+00  3.159e-01   4.185 2.85e-05 ***
## MSKC8        2.530e+00  3.998e-01   6.327 2.49e-10 ***
## MSKC9        9.878e-02  4.500e-01   0.220 0.826247    
## MSKD1        8.764e-02  9.576e-02   0.915 0.360110    
## MSKD2       -4.269e-02  1.146e-01  -0.372 0.709546    
## MSKD3       -4.204e-01  1.661e-01  -2.531 0.011376 *  
## MSKD4       -6.902e-01  2.617e-01  -2.638 0.008341 ** 
## MSKD5       -1.923e+00  6.259e-01  -3.072 0.002128 ** 
## MSKD6       -1.597e+01  5.245e+02  -0.030 0.975704    
## MSKD7        3.037e+00  1.172e+00   2.591 0.009560 ** 
## MSKD9       -1.412e+01  6.523e+03  -0.002 0.998273    
## MHKOOP1     -5.018e-01  1.799e-01  -2.789 0.005288 ** 
## MHKOOP2     -1.101e+00  1.719e-01  -6.408 1.47e-10 ***
## MHKOOP3     -3.033e-01  1.637e-01  -1.853 0.063837 .  
## MHKOOP4     -3.893e-01  1.646e-01  -2.366 0.017996 *  
## MHKOOP5     -3.748e-01  1.586e-01  -2.362 0.018159 *  
## MHKOOP6     -8.647e-02  1.572e-01  -0.550 0.582292    
## MHKOOP7     -6.817e-01  1.521e-01  -4.482 7.40e-06 ***
## MHKOOP8     -3.731e-01  1.651e-01  -2.260 0.023796 *  
## MHKOOP9     -9.742e-02  1.516e-01  -0.643 0.520548    
## MAUT11       9.743e+00  8.002e+03   0.001 0.999029    
## MAUT12       1.279e+01  4.439e+03   0.003 0.997702    
## MAUT13       1.253e+01  4.439e+03   0.003 0.997748    
## MAUT14       1.516e+01  4.439e+03   0.003 0.997275    
## MAUT15       1.581e+01  4.439e+03   0.004 0.997158    
## MAUT16       1.592e+01  4.439e+03   0.004 0.997138    
## MAUT17       1.625e+01  4.439e+03   0.004 0.997079    
## MAUT18       1.646e+01  4.439e+03   0.004 0.997041    
## MAUT19       1.575e+01  4.439e+03   0.004 0.997169    
## MAUT21       2.152e-01  1.170e-01   1.840 0.065820 .  
## MAUT22      -1.704e-01  1.624e-01  -1.049 0.294169    
## MAUT23      -3.760e-01  2.579e-01  -1.458 0.144865    
## MAUT24       4.427e-01  3.441e-01   1.287 0.198221    
## MAUT25       1.446e-01  5.945e-01   0.243 0.807762    
## MAUT26       3.485e+00  1.163e+00   2.996 0.002733 ** 
## MAUT27       2.589e+00  6.617e+03   0.000 0.999688    
## MAUT01      -3.413e-02  1.432e-01  -0.238 0.811668    
## MAUT02       3.263e-01  1.663e-01   1.962 0.049757 *  
## MAUT03       7.060e-01  2.521e-01   2.800 0.005106 ** 
## MAUT04      -9.803e-01  3.538e-01  -2.771 0.005590 ** 
## MAUT05       6.194e-01  5.090e-01   1.217 0.223618    
## MAUT06       3.628e+00  8.819e-01   4.114 3.89e-05 ***
## MAUT07      -5.926e+01  1.117e+03  -0.053 0.957700    
## MAUT08      -5.818e+00  6.760e+03  -0.001 0.999313    
## MAUT09      -3.893e+00  4.591e+03  -0.001 0.999323    
## MINK30451   -1.238e-01  2.049e-01  -0.604 0.545630    
## MINK30452    2.546e-01  1.592e-01   1.599 0.109756    
## MINK30453    1.471e-01  1.530e-01   0.961 0.336555    
## MINK30454   -3.928e-02  1.552e-01  -0.253 0.800180    
## MINK30455   -1.862e-01  1.646e-01  -1.131 0.257924    
## MINK30456   -2.170e-01  1.809e-01  -1.200 0.230295    
## MINK30457    1.232e-01  2.224e-01   0.554 0.579541    
## MINK30458    7.610e-02  3.992e-01   0.191 0.848798    
## MINK30459    1.589e+00  2.904e-01   5.470 4.50e-08 ***
## MINK75121    5.293e-01  9.215e-02   5.744 9.24e-09 ***
## MINK75122    1.609e-01  1.082e-01   1.488 0.136831    
## MINK75123    1.830e-01  1.545e-01   1.184 0.236218    
## MINK75124    7.571e-01  1.994e-01   3.798 0.000146 ***
## MINK75125   -2.893e-01  2.855e-01  -1.013 0.310921    
## MINK75126   -1.808e+01  1.777e+03  -0.010 0.991882    
## MINK75127   -2.192e+01  6.523e+03  -0.003 0.997318    
## MINK75128   -2.041e+01  2.392e+03  -0.009 0.993194    
## MINK75129    3.738e+00  8.986e-01   4.160 3.19e-05 ***
## MINK123M1   -2.771e-01  1.092e-01  -2.537 0.011172 *  
## MINK123M2    8.420e-02  2.388e-01   0.353 0.724437    
## MINK123M3   -1.565e+00  4.367e-01  -3.585 0.000338 ***
## MINK123M4   -1.856e+01  1.116e+03  -0.017 0.986727    
## MINK123M5   -1.938e+01  6.523e+03  -0.003 0.997629    
## MINK123M7   -2.319e+01  6.523e+03  -0.004 0.997164    
## MINK123M9   -1.710e+01  6.523e+03  -0.003 0.997908    
## MKOOPKLA2    2.495e-01  6.925e-01   0.360 0.718609    
## MKOOPKLA3   -1.578e+01  1.335e+03  -0.012 0.990567    
## MKOOPKLA4   -1.520e+01  1.335e+03  -0.011 0.990911    
## MKOOPKLA5   -1.471e+01  1.335e+03  -0.011 0.991206    
## MKOOPKLA6   -1.297e+01  1.335e+03  -0.010 0.992249    
## MKOOPKLA7   -1.400e+01  1.335e+03  -0.010 0.991630    
## MKOOPKLA8   -1.396e+01  1.335e+03  -0.010 0.991652    
## PPERSAUT4   -1.816e+01  6.523e+03  -0.003 0.997779    
## PPERSAUT5   -1.540e-01  1.167e-01  -1.320 0.186823    
## PPERSAUT6    1.822e+00  6.365e-02  28.627  < 2e-16 ***
## PPERSAUT7   -1.769e+01  8.132e+02  -0.022 0.982642    
## PPERSAUT8   -1.666e+01  4.201e+03  -0.004 0.996835    
## PMOTSCO3     3.417e+00  1.510e+00   2.263 0.023619 *  
## PMOTSCO4    -1.211e-01  1.781e-01  -0.680 0.496465    
## PMOTSCO5     3.240e-01  3.044e-01   1.064 0.287210    
## PMOTSCO6    -2.791e+00  5.439e-01  -5.131 2.89e-07 ***
## PMOTSCO7    -1.912e+01  4.285e+03  -0.004 0.996439    
## PVRAAUT4    -1.905e+01  6.523e+03  -0.003 0.997669    
## PVRAAUT6    -1.727e+01  2.121e+03  -0.008 0.993503    
## PVRAAUT9     4.590e-01  6.573e+03   0.000 0.999944    
## PAANHANG1    8.150e-01  5.014e-01   1.626 0.104050    
## PAANHANG2    1.093e+00  3.780e-01   2.891 0.003843 ** 
## PAANHANG3   -1.636e+01  2.227e+03  -0.007 0.994140    
## PAANHANG4   -1.873e+01  6.523e+03  -0.003 0.997709    
## PAANHANG5   -2.436e+00  6.859e+03   0.000 0.999717    
## PWERKT2     -1.614e+01  2.301e+03  -0.007 0.994404    
## PWERKT3     -1.641e+01  2.063e+03  -0.008 0.993656    
## PWERKT4     -1.512e+01  1.888e+03  -0.008 0.993613    
## PWERKT6     -1.385e+01  3.068e+03  -0.005 0.996398    
## PWAOREG4    -1.878e+01  6.523e+03  -0.003 0.997703    
## PWAOREG5    -1.633e+01  6.523e+03  -0.003 0.998002    
## PWAOREG6     3.056e+00  4.099e-01   7.455 8.98e-14 ***
## PWAOREG7    -1.866e+01  3.214e+03  -0.006 0.995368    
## PPLEZIER1    5.346e+00  1.540e+00   3.472 0.000517 ***
## PPLEZIER2    3.251e-01  1.424e+00   0.228 0.819386    
## PPLEZIER3    2.489e+00  1.419e+00   1.754 0.079478 .  
## PPLEZIER4    1.229e+00  1.177e+00   1.044 0.296664    
## PPLEZIER5   -1.619e+01  3.925e+03  -0.004 0.996709    
## PPLEZIER6    5.117e+01  8.605e+02   0.059 0.952586    
## AWAPART1     3.934e-01  7.312e-02   5.380 7.44e-08 ***
## AWAPART2    -1.807e+01  2.063e+03  -0.009 0.993012    
## AWALAND1    -1.516e+00  2.676e-01  -5.666 1.46e-08 ***
## ABROM1      -7.671e-01  1.497e-01  -5.125 2.97e-07 ***
## ABROM2      -1.739e+01  1.539e+03  -0.011 0.990984    
## ALEVEN1     -9.233e-01  1.783e-01  -5.179 2.23e-07 ***
## ALEVEN2      1.606e-03  1.833e-01   0.009 0.993006    
## ALEVEN3      1.952e-01  4.673e-01   0.418 0.676084    
## ALEVEN4      1.807e+00  6.341e-01   2.849 0.004382 ** 
## ALEVEN8     -8.667e-01  7.758e+03   0.000 0.999911    
## APERSONG1   -4.124e+00  9.580e-01  -4.305 1.67e-05 ***
## AGEZONG1     3.443e-01  3.000e-01   1.148 0.251029    
## ABRAND1      3.499e-01  7.426e-02   4.711 2.46e-06 ***
## ABRAND2      1.019e-01  2.022e-01   0.504 0.614426    
## ABRAND3     -1.891e+01  1.779e+03  -0.011 0.991520    
## ABRAND4     -1.711e+01  2.978e+03  -0.006 0.995414    
## ABRAND5     -1.265e+01  3.790e+03  -0.003 0.997337    
## ABRAND7     -1.600e+01  6.523e+03  -0.002 0.998042    
## APLEZIER1    1.038e+00  1.269e+00   0.817 0.413744    
## APLEZIER2           NA         NA      NA       NA    
## AFIETS1      1.080e+00  1.956e-01   5.523 3.34e-08 ***
## AFIETS2      4.689e-01  3.066e-01   1.529 0.126164    
## AFIETS3      2.962e+00  1.169e+00   2.532 0.011331 *  
## ABYSTAND1    9.915e-01  2.000e-01   4.956 7.18e-07 ***
## ABYSTAND2   -1.855e+01  6.523e+03  -0.003 0.997731    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15177  on 10947  degrees of freedom
## Residual deviance:  9723  on 10662  degrees of freedom
## AIC: 10295
## 
## Number of Fisher Scoring iterations: 17
drewMatrix(model.2, over_test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2132  101
##          1 1630  137
##                                           
##                Accuracy : 0.5672          
##                  95% CI : (0.5517, 0.5827)
##     No Information Rate : 0.9405          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0355          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.57563         
##             Specificity : 0.56672         
##          Pos Pred Value : 0.07753         
##          Neg Pred Value : 0.95477         
##              Prevalence : 0.05950         
##          Detection Rate : 0.03425         
##    Detection Prevalence : 0.44175         
##       Balanced Accuracy : 0.57118         
##                                           
##        'Positive' Class : 1               
## 
# difference in deviance = Null deviance (15177.2) - Residual deviance (9723) = 5454
# Sensitivity : 58%
# Accuracy : 57%

As we have seen, most of the variables are significant. For example, if MGODRK moves up one level (the MGODRK1 dummy switching on), the log-odds of buying caravan insurance increase by about 0.33 — so such a customer is more likely to buy, though note that 0.33 is a change in log-odds, not a probability.

Applying the GLM from the step-wise (backward) model, we see we have produced a good model. The reason we consider it an improved model is that we have relatively improved our accuracy and also reduced the number of variables. If we want to improve the model further, we can change our cut-off value to get better results, which in turn will increase specificity. Although the classification rate is 0.46, on average the model is producing good results.

Model 3 with Domain knowledge

Now will create another model which would be our 3rd model. This model is based on our domain knowledge which we have collected by reading a few articles and deciding the factors that play the most important role in determining whether the customer will buy an insurance or not. We spent a good amount of time adding and removing variables to come up with this model, which we assume would be a better one. To make sure it is, we have verified it as well.

corrplot(cor(subset(df_train , select = c("PBRAND", "MOSTYPE", "PPERSAUT", "MKOOPKLA", "MHKOOP", "CARAVAN"))), method = "number", type = "upper")

Simply put, we ran a correlation matrix among the variables of interest based on our domain knowledge and obtained the following matrix.

# Model 3: logistic regression restricted to the domain-knowledge predictors.
train_3 <- over_train
# Convert every column to a factor so glm treats the integer-coded survey
# variables as categorical. Replaces an index loop over 1:ncol(train_3)
# (unsafe for zero columns, non-idiomatic) with the whole-frame lapply form.
train_3[] <- lapply(train_3, as.factor)
model.3 <- glm(formula = CARAVAN ~ PBRAND + MOSTYPE + PPERSAUT + MKOOPKLA + MHKOOP,
    family = binomial(link = "logit"),
    data = train_3)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Merge the test set's PPERSAUT levels into the model's xlevels so predict()
# does not error on levels unseen during training. NOTE(review): no
# coefficients exist for the added levels — verify how such rows are scored.
model.3$xlevels[["PPERSAUT"]] <- union(model.3$xlevels[["PPERSAUT"]], levels(over_test$PPERSAUT))

# Predicted probabilities on the test set, then hard 0/1 classes at a 0.5 cut-off.
predicted_3 <- predict(model.3, over_test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
predictedClass_3 <- ifelse(predicted_3 >= 0.5, 1, 0)

# Coefficient summary for model 3 (drewSummary is a helper defined elsewhere).
drewSummary(model.3)
## 
## Call:
## glm(formula = CARAVAN ~ PBRAND + MOSTYPE + PPERSAUT + MKOOPKLA + 
##     MHKOOP, family = binomial(link = "logit"), data = train_3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1689  -0.8847   0.2209   0.9066   2.0789  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   14.84840  531.91996   0.028 0.977730    
## PBRAND1       -0.69741    0.20121  -3.466 0.000528 ***
## PBRAND2       -1.10821    0.12976  -8.541  < 2e-16 ***
## PBRAND3        0.70967    0.06415  11.062  < 2e-16 ***
## PBRAND4        0.72419    0.05625  12.875  < 2e-16 ***
## PBRAND5       -0.16518    0.14691  -1.124 0.260876    
## PBRAND6       -0.59417    0.18178  -3.269 0.001081 ** 
## PBRAND7      -16.25358  655.68587  -0.025 0.980224    
## PBRAND8      -16.51102 2399.54472  -0.007 0.994510    
## MOSTYPE2      -2.48726    0.54586  -4.557 5.20e-06 ***
## MOSTYPE3      -2.44264    0.52563  -4.647 3.37e-06 ***
## MOSTYPE4      -3.38937    0.58376  -5.806 6.39e-09 ***
## MOSTYPE5      -0.86187    1.15298  -0.748 0.454753    
## MOSTYPE6      -0.31446    0.18655  -1.686 0.091857 .  
## MOSTYPE7      -2.93594    0.56682  -5.180 2.22e-07 ***
## MOSTYPE8      -0.38850    0.35911  -1.082 0.279316    
## MOSTYPE9      -2.06787    1.09704  -1.885 0.059437 .  
## MOSTYPE10     -0.97191    0.19183  -5.067 4.05e-07 ***
## MOSTYPE11     -2.91971    0.52146  -5.599 2.16e-08 ***
## MOSTYPE12     -0.39390    0.36022  -1.094 0.274172    
## MOSTYPE13     -2.27062    0.52318  -4.340 1.42e-05 ***
## MOSTYPE15    -31.43989 1149.29323  -0.027 0.978176    
## MOSTYPE16    -32.34378  737.64043  -0.044 0.965026    
## MOSTYPE17    -18.27821  724.48272  -0.025 0.979872    
## MOSTYPE18    -32.03103  719.26840  -0.045 0.964480    
## MOSTYPE19    -17.66798 1224.71101  -0.014 0.988490    
## MOSTYPE20    -15.79842  531.91971  -0.030 0.976306    
## MOSTYPE21    -32.06790  779.25911  -0.041 0.967175    
## MOSTYPE22    -16.88569  531.91970  -0.032 0.974676    
## MOSTYPE23     -1.51402    1.12995  -1.340 0.180277    
## MOSTYPE24    -16.89003  531.91971  -0.032 0.974669    
## MOSTYPE25    -16.27280  531.91996  -0.031 0.975594    
## MOSTYPE26    -17.07803  531.92006  -0.032 0.974387    
## MOSTYPE27    -16.36495  531.92004  -0.031 0.975456    
## MOSTYPE28    -32.07183  685.79372  -0.047 0.962700    
## MOSTYPE29     -1.80197    1.14436  -1.575 0.115335    
## MOSTYPE30    -17.30764  531.91973  -0.033 0.974043    
## MOSTYPE31    -16.86894  531.91997  -0.032 0.974701    
## MOSTYPE32    -15.60633  531.91996  -0.029 0.976594    
## MOSTYPE33     -0.99299    1.12335  -0.884 0.376719    
## MOSTYPE34     -2.87946    0.53378  -5.395 6.87e-08 ***
## MOSTYPE35     -3.00097    1.06319  -2.823 0.004763 ** 
## MOSTYPE36     -0.37743    1.12784  -0.335 0.737890    
## MOSTYPE37     -1.46596    1.09355  -1.341 0.180068    
## MOSTYPE38     -1.31019    1.09405  -1.198 0.231089    
## MOSTYPE39     -2.20077    1.06283  -2.071 0.038390 *  
## MOSTYPE40    -17.22686  254.62643  -0.068 0.946060    
## MOSTYPE41     -2.66862    1.10137  -2.423 0.015393 *  
## PPERSAUT4    -15.52662 2399.54472  -0.006 0.994837    
## PPERSAUT5     -0.06279    0.09359  -0.671 0.502287    
## PPERSAUT6      1.57751    0.05024  31.398  < 2e-16 ***
## PPERSAUT7    -15.57298  333.14133  -0.047 0.962716    
## PPERSAUT8    -15.12060 1126.54079  -0.013 0.989291    
## MKOOPKLA2      0.39402    0.51798   0.761 0.446843    
## MKOOPKLA3    -15.51669  531.91877  -0.029 0.976728    
## MKOOPKLA4    -14.43144  531.91883  -0.027 0.978355    
## MKOOPKLA5    -13.93127  531.91889  -0.026 0.979105    
## MKOOPKLA6    -13.52861  531.91969  -0.025 0.979709    
## MKOOPKLA7    -15.16099  531.91983  -0.029 0.977261    
## MKOOPKLA8    -15.99471  531.91993  -0.030 0.976011    
## MHKOOP1       -0.11577    0.11516  -1.005 0.314793    
## MHKOOP2       -0.29700    0.11981  -2.479 0.013179 *  
## MHKOOP3        0.38327    0.11613   3.301 0.000965 ***
## MHKOOP4        0.19255    0.10873   1.771 0.076579 .  
## MHKOOP5       -0.01002    0.10802  -0.093 0.926102    
## MHKOOP6        0.48161    0.10298   4.677 2.91e-06 ***
## MHKOOP7        0.13166    0.10172   1.294 0.195534    
## MHKOOP8        0.65145    0.11310   5.760 8.42e-09 ***
## MHKOOP9        0.52022    0.09889   5.261 1.43e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15177  on 10947  degrees of freedom
## Residual deviance: 11912  on 10879  degrees of freedom
## AIC: 12050
## 
## Number of Fisher Scoring iterations: 15
drewMatrix(model.3, over_test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2350   84
##          1 1412  154
##                                          
##                Accuracy : 0.626          
##                  95% CI : (0.6108, 0.641)
##     No Information Rate : 0.9405         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.0752         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.64706        
##             Specificity : 0.62467        
##          Pos Pred Value : 0.09834        
##          Neg Pred Value : 0.96549        
##              Prevalence : 0.05950        
##          Detection Rate : 0.03850        
##    Detection Prevalence : 0.39150        
##       Balanced Accuracy : 0.63586        
##                                          
##        'Positive' Class : 1              
## 
# RMSE of the thresholded 0/1 predictions (getRMSE is a helper defined
# elsewhere in this file). NOTE(review): RMSE on hard class labels is an
# unusual metric for a classifier — confirm this is intended rather than
# RMSE on the predicted probabilities.
getRMSE(predictedClass_3)
## [1] 0.842615
# Area under the ROC curve (ROCR for the curve, pROC::auc for the scalar).
# Bug fix: the original line read `plot(perf) > auc(...)`, a console-prompt
# paste artifact that compared plot()'s invisible NULL result with the AUC and
# returned logical(0) (visible in the output below) instead of the AUC itself.
# NOTE(review): pr is built from the hard 0/1 labels (predictedClass_3), not
# the probabilities (predicted_3); an ROC on thresholded labels has a single
# operating point — consider using predicted_3 here instead.
pr <- prediction(predictedClass_3, over_test$CARAVAN)
perf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(perf)
auc(over_test$CARAVAN, predictedClass_3)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## logical(0)
# Compute the AUC with ROCR and extract the scalar value from the
# performance object's y.values slot.
auc_ROCR <- performance(pr, measure = "auc")
auc_ROCR <- auc_ROCR@y.values[[1]]


pR2(model.3)['McFadden']
## fitting null model for pseudo-r2
##  McFadden 
## 0.2151129
# difference in deviance = Null deviance (15177) - Residual deviance (11912) = 3265
# Sensitivity : 65%
# Accuracy : 63%

For this model, we created a confusion matrix along with an ROC curve. We see that the accuracy is approximately 63%, while sensitivity is about 65% and specificity is roughly 62%.

AUC-ROC plot (logistic regression): our AUC score is 0.59. The closer the AUC is to 1 the better, so this is only moderately better than chance (0.5).

Evaluating the performance of model 3

# Fitted values for model 3 on the training data.
# NOTE(review): predict() on a glm defaults to type = "link", so these are
# log-odds, not probabilities — confirm that is intended for these plots.
pred_t <- predict(model.3, na.action=na.pass)
hist(pred_t)

boxplot(pred_t)

##Plotting residual histograms for training and validation data
# residuals() on a glm defaults to deviance residuals.
resid.t<-residuals(model.3)
hist(resid.t)

From the above histogram we can see the range of our residuals is between -2 and 2.

ROC Model 3

drewROC(model.3)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

From the above ROC curve we can see that, compared with the previous ROC, we have achieved a higher area under the curve.

Lift chart

# caret::lift object with class "1" (a caravan purchase) as the event of
# interest; the xyplot call that would render the gain chart is left
# commented out.
lift.example <- lift(relevel(as.factor(over_test$CARAVAN), ref="1") ~ predicted_3, data = over_test)
#xyplot(lift.example, plot = "gain")

Decile-wise chart

library(gains)

# Coerce the outcome and model-3 predictions to numeric so gains() can bin them
actual <- as.numeric(over_test$CARAVAN)
predicted_3_num <- as.numeric(predicted_3)

# Decile-wise gains table for the model-3 predictions
gain <- gains(actual, predicted_3_num)

# Decile-wise lift: mean response per decile, relative to the overall mean
barplot(gain$mean.resp / mean(actual),
        names.arg = gain$depth,
        xlab = "Percentile",
        ylab = "Mean Response",
        main = "Decile-wise lift chart")

Since our outcome is categorical, we need to assess the predictive performance of our models. One method to do so is the decile-wise lift chart. The idea behind a decile-wise chart is that it divides the observations into 10 bins, ranked by predicted probability; each bar shows how much better the model performs in that bin than a random baseline. It must also be kept in mind that a good decile-wise chart slopes downward from left to right, and that is roughly the pattern we achieved here, which suggests the model's predictions are usefully ranking the positive cases.

# Copy of the oversampled training set with every column coerced to factor,
# so the glm() fit below treats each predictor as categorical.
# Replaces the original `for (i in 1:ncol(train_4))` index loop (an
# anti-pattern: 1:ncol breaks on zero-column input) with a vectorized lapply;
# `train_4[] <-` replaces the columns while keeping the data.frame structure.
train_4 <- over_train
train_4[] <- lapply(train_4, as.factor)

Model 4 - Decision tree.

# Model 4: classification tree on the oversampled training data.
# rpart.control: minsplit = 20 (min observations required to attempt a split),
# minbucket = 1 (min observations allowed in a leaf), cp = 0.008 (complexity
# threshold below which splits are not attempted).
fit1 = rpart(formula=CARAVAN ~ .,data=over_train,method = 'class', control=rpart.control(minsplit=20, minbucket=1, cp=0.008))


# Complexity-parameter table: cross-validated error (xerror) at each pruning level
printcp(fit1)
## 
## Classification tree:
## rpart(formula = CARAVAN ~ ., data = over_train, method = "class", 
##     control = rpart.control(minsplit = 20, minbucket = 1, cp = 0.008))
## 
## Variables actually used in tree construction:
##  [1] MBERHOOG MBERMIDD MFALLEEN MGODGE   MHKOOP   MINKGEM  MOSTYPE  MSKB2   
##  [9] MSKC     PBRAND   PPERSAUT
## 
## Root node error: 5474/10948 = 0.5
## 
## n= 10948 
## 
##          CP nsplit rel error  xerror      xstd
## 1 0.3818049      0   1.00000 1.02320 0.0095547
## 2 0.0277676      1   0.61820 0.61838 0.0088339
## 3 0.0217391      3   0.56266 0.57033 0.0086301
## 4 0.0105042      4   0.54092 0.55225 0.0085457
## 5 0.0088905      6   0.51991 0.51096 0.0083364
## 6 0.0080000     13   0.44574 0.47077 0.0081091
# Draw the fitted classification tree (fancyRpartPlot, presumably from the
# rattle package — loaded earlier in the file)
fancyRpartPlot(fit1)

# Logistic regression on the factor-coded training data (train_4), using the
# predictors the decision tree above identified as most important
glm_6 = glm(formula = CARAVAN ~ PPERSAUT + MBERHOOG +MGODPR  + MHKOOP +MINKGEM  +MINKM30  +MOSTYPE  +PBRAND   +PBROM, family = binomial(link = "logit"),data = train_4)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Merge the test set's PPERSAUT levels into the model's stored factor levels so
# that predict() below does not fail on levels unseen during training
glm_6$xlevels[["PPERSAUT"]] <- union(glm_6$xlevels[["PPERSAUT"]], levels(over_test$PPERSAUT))
summary(glm_6)
## 
## Call:
## glm(formula = CARAVAN ~ PPERSAUT + MBERHOOG + MGODPR + MHKOOP + 
##     MINKGEM + MINKM30 + MOSTYPE + PBRAND + PBROM, family = binomial(link = "logit"), 
##     data = train_4)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5544  -0.8385   0.1426   0.8595   1.9681  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.979e+01  4.292e+02  -0.046 0.963221    
## PPERSAUT4   -1.481e+01  2.400e+03  -0.006 0.995075    
## PPERSAUT5   -7.410e-02  9.685e-02  -0.765 0.444243    
## PPERSAUT6    1.581e+00  5.307e-02  29.798  < 2e-16 ***
## PPERSAUT7   -1.575e+01  3.249e+02  -0.048 0.961343    
## PPERSAUT8   -1.502e+01  1.002e+03  -0.015 0.988038    
## MBERHOOG1    1.655e-01  7.862e-02   2.105 0.035328 *  
## MBERHOOG2    2.065e-01  7.579e-02   2.725 0.006434 ** 
## MBERHOOG3   -2.120e-02  9.136e-02  -0.232 0.816502    
## MBERHOOG4    2.924e-01  1.115e-01   2.622 0.008740 ** 
## MBERHOOG5   -4.591e-01  1.452e-01  -3.162 0.001565 ** 
## MBERHOOG6    8.413e-02  1.579e-01   0.533 0.594258    
## MBERHOOG7    9.128e-01  1.900e-01   4.805 1.55e-06 ***
## MBERHOOG8    9.297e-01  3.203e-01   2.903 0.003696 ** 
## MBERHOOG9   -2.876e-01  3.884e-01  -0.740 0.459136    
## MGODPR1      1.473e+00  3.797e-01   3.878 0.000105 ***
## MGODPR2      1.761e+00  3.401e-01   5.178 2.24e-07 ***
## MGODPR3      2.011e+00  3.362e-01   5.982 2.20e-09 ***
## MGODPR4      1.891e+00  3.304e-01   5.721 1.06e-08 ***
## MGODPR5      2.090e+00  3.317e-01   6.302 2.95e-10 ***
## MGODPR6      1.939e+00  3.355e-01   5.781 7.42e-09 ***
## MGODPR7      2.659e+00  3.355e-01   7.926 2.27e-15 ***
## MGODPR8      1.533e+00  4.041e-01   3.794 0.000148 ***
## MGODPR9      1.781e+00  3.616e-01   4.924 8.46e-07 ***
## MHKOOP1     -4.522e-01  1.244e-01  -3.633 0.000280 ***
## MHKOOP2     -3.963e-01  1.273e-01  -3.114 0.001846 ** 
## MHKOOP3      1.152e-01  1.271e-01   0.906 0.365026    
## MHKOOP4     -9.811e-02  1.186e-01  -0.827 0.408147    
## MHKOOP5     -4.676e-01  1.177e-01  -3.972 7.13e-05 ***
## MHKOOP6      9.456e-03  1.175e-01   0.080 0.935871    
## MHKOOP7     -3.840e-01  1.165e-01  -3.297 0.000976 ***
## MHKOOP8     -1.893e-02  1.279e-01  -0.148 0.882338    
## MHKOOP9      3.286e-03  1.178e-01   0.028 0.977742    
## MINKGEM1     1.617e+01  4.292e+02   0.038 0.969943    
## MINKGEM2     1.628e+01  4.292e+02   0.038 0.969749    
## MINKGEM3     1.627e+01  4.292e+02   0.038 0.969762    
## MINKGEM4     1.679e+01  4.292e+02   0.039 0.968789    
## MINKGEM5     1.687e+01  4.292e+02   0.039 0.968643    
## MINKGEM6     1.638e+01  4.292e+02   0.038 0.969554    
## MINKGEM7     1.726e+01  4.292e+02   0.040 0.967926    
## MINKGEM8     1.727e+01  4.292e+02   0.040 0.967905    
## MINKGEM9     6.633e-01  6.125e+02   0.001 0.999136    
## MINKM301     3.282e-01  8.967e-02   3.660 0.000252 ***
## MINKM302     5.202e-01  7.935e-02   6.556 5.51e-11 ***
## MINKM303     1.136e-02  9.222e-02   0.123 0.901951    
## MINKM304    -3.912e-01  1.174e-01  -3.332 0.000862 ***
## MINKM305    -8.430e-02  1.284e-01  -0.657 0.511387    
## MINKM306     3.190e-01  1.777e-01   1.795 0.072658 .  
## MINKM307     2.203e-01  2.239e-01   0.984 0.325214    
## MINKM308    -4.193e-01  3.681e-01  -1.139 0.254637    
## MINKM309    -1.548e+01  2.810e+02  -0.055 0.956084    
## MOSTYPE2     2.043e-01  2.478e-01   0.824 0.409801    
## MOSTYPE3     2.224e-01  1.868e-01   1.190 0.233871    
## MOSTYPE4    -8.216e-01  3.213e-01  -2.557 0.010546 *  
## MOSTYPE5    -1.795e-01  3.350e-01  -0.536 0.591997    
## MOSTYPE6    -3.494e-01  2.060e-01  -1.696 0.089872 .  
## MOSTYPE7    -7.681e-01  3.176e-01  -2.419 0.015573 *  
## MOSTYPE8     4.584e-01  1.759e-01   2.606 0.009164 ** 
## MOSTYPE9    -2.203e-01  2.054e-01  -1.072 0.283581    
## MOSTYPE10   -9.310e-01  2.130e-01  -4.371 1.24e-05 ***
## MOSTYPE11   -4.856e-01  2.189e-01  -2.218 0.026521 *  
## MOSTYPE12    4.501e-01  2.152e-01   2.092 0.036457 *  
## MOSTYPE13    9.188e-02  2.024e-01   0.454 0.649794    
## MOSTYPE15   -1.399e+01  1.009e+03  -0.014 0.988943    
## MOSTYPE16   -1.521e+01  4.887e+02  -0.031 0.975172    
## MOSTYPE17   -1.512e+01  7.191e+02  -0.021 0.983224    
## MOSTYPE18   -1.526e+01  4.774e+02  -0.032 0.974500    
## MOSTYPE19   -1.606e+01  1.235e+03  -0.013 0.989626    
## MOSTYPE20    4.775e-01  3.614e-01   1.321 0.186441    
## MOSTYPE21   -1.510e+01  5.837e+02  -0.026 0.979356    
## MOSTYPE22   -3.598e-01  2.593e-01  -1.388 0.165273    
## MOSTYPE23   -4.672e-01  2.371e-01  -1.970 0.048788 *  
## MOSTYPE24   -1.440e-01  2.380e-01  -0.605 0.545264    
## MOSTYPE25    1.020e-02  2.837e-01   0.036 0.971320    
## MOSTYPE26   -9.598e-01  4.031e-01  -2.381 0.017250 *  
## MOSTYPE27   -6.498e-01  3.640e-01  -1.785 0.074251 .  
## MOSTYPE28   -1.606e+01  4.178e+02  -0.038 0.969347    
## MOSTYPE29   -1.074e+00  3.009e-01  -3.571 0.000356 ***
## MOSTYPE30   -5.886e-01  2.519e-01  -2.336 0.019477 *  
## MOSTYPE31   -4.754e-01  2.331e-01  -2.040 0.041351 *  
## MOSTYPE32    7.138e-01  2.246e-01   3.177 0.001486 ** 
## MOSTYPE33   -2.133e-01  1.794e-01  -1.189 0.234558    
## MOSTYPE34   -4.996e-01  2.152e-01  -2.321 0.020285 *  
## MOSTYPE35   -1.072e+00  2.132e-01  -5.029 4.92e-07 ***
## MOSTYPE36    3.005e-01  2.019e-01   1.488 0.136693    
## MOSTYPE37    2.169e-01  2.198e-01   0.987 0.323688    
## MOSTYPE38    6.371e-01  1.942e-01   3.281 0.001035 ** 
## MOSTYPE39    7.777e-03  1.942e-01   0.040 0.968054    
## MOSTYPE40   -1.626e+01  2.480e+02  -0.066 0.947712    
## MOSTYPE41   -1.270e+00  2.312e-01  -5.492 3.97e-08 ***
## PBRAND1     -6.867e-01  2.087e-01  -3.291 0.000999 ***
## PBRAND2     -1.142e+00  1.325e-01  -8.617  < 2e-16 ***
## PBRAND3      7.107e-01  6.666e-02  10.661  < 2e-16 ***
## PBRAND4      7.790e-01  5.966e-02  13.058  < 2e-16 ***
## PBRAND5     -9.026e-02  1.555e-01  -0.581 0.561494    
## PBRAND6     -6.508e-01  1.900e-01  -3.425 0.000614 ***
## PBRAND7     -1.614e+01  6.320e+02  -0.026 0.979630    
## PBRAND8     -1.597e+01  2.400e+03  -0.007 0.994691    
## PBROM2      -8.912e-01  3.930e-01  -2.267 0.023364 *  
## PBROM3      -8.011e-01  1.447e-01  -5.536 3.10e-08 ***
## PBROM4      -1.527e+01  2.698e+02  -0.057 0.954862    
## PBROM5      -2.445e-01  5.191e-01  -0.471 0.637675    
## PBROM6      -1.592e+01  2.400e+03  -0.007 0.994707    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15177  on 10947  degrees of freedom
## Residual deviance: 11274  on 10845  degrees of freedom
## AIC: 11480
## 
## Number of Fisher Scoring iterations: 15
# Hold-out predicted probabilities from glm_6, then hard class labels at a 0.5
# probability cutoff. (Idiom fix: `<-` instead of `=` for top-level assignment.)
predicted_6 <- predict(glm_6, over_test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
predictedClass_6 <- ifelse(predicted_6 >= 0.5, 1, 0)


# Confusion matrix for glm_6 on the hold-out set, with class "1" (policy
# purchased) as the positive class
confusionMatrix(as.factor(predictedClass_6), as.factor(over_test$CARAVAN), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2514  101
##          1 1248  137
##                                           
##                Accuracy : 0.6628          
##                  95% CI : (0.6479, 0.6774)
##     No Information Rate : 0.9405          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0749          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.57563         
##             Specificity : 0.66826         
##          Pos Pred Value : 0.09892         
##          Neg Pred Value : 0.96138         
##              Prevalence : 0.05950         
##          Detection Rate : 0.03425         
##    Detection Prevalence : 0.34625         
##       Balanced Accuracy : 0.62195         
##                                           
##        'Positive' Class : 1               
## 
# Numeric error measures (ME/RMSE/MAE/MPE/MAPE) of the 0/1 class predictions.
# NOTE(review): if over_test$CARAVAN is a factor, as.numeric() yields level
# codes (1/2) rather than 0/1 — the ME of ~0.71 suggests such an offset.
# Confirm the column's type before trusting these figures.
accuracy(predictedClass_6, as.numeric(over_test$CARAVAN))
##               ME      RMSE     MAE     MPE    MAPE
## Test set 0.71325 0.8739279 0.71325 67.0875 67.0875
# McFadden's pseudo-R^2 for glm_6
pR2(glm_6)['McFadden']
## fitting null model for pseudo-r2
##  McFadden 
## 0.2571451
# Difference in deviance = null deviance (15177) - residual deviance (11274) = 3903
# NOTE(review): original comment used 12069, but the summary above reports a
# residual deviance of 11274.
# Accuracy ~66% (0.6628 in the confusion matrix above); Sensitivity ~58%

Now we ran another model based on a decision tree. The good thing about decision trees is that they reveal the variables of most importance, i.e. those having the greatest impact on our response variable. From the decision tree we are able to see which handful of variables matter most. The above model was run at a cut-off of 0.5, and we achieved an accuracy of roughly 66%, a sensitivity of 58%, and a specificity of 67%.

However, we ran the model with cut-off at .4 as well. We observed that while accuracy decreased, specificity increased.

ROC model 4

# ROC curve for glm_6 (same drewROC helper as used for model 3)
drewROC(glm_6)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

Model 1 and 2 comparison

# Likelihood-ratio (chi-squared) test comparing models 1 and 2: a significant
# result means the larger model fits significantly better
anova(logit.reg,model.2,test = 'Chisq')
## Analysis of Deviance Table
## 
## Model 1: CARAVAN ~ MOSHOOFD + MGEMOMV + OneHouse + MINKGEM_c + MGEMLEEF_c
## Model 2: CARAVAN ~ MOSTYPE + MGODRK + MGODPR + MGODOV + MRELGE + MRELSA + 
##     MOPLMIDD + MOPLLAAG + MBERHOOG + MBERZELF + MBERBOER + MBERMIDD + 
##     MBERARBG + MBERARBO + MSKC + MSKD + MHKOOP + MAUT1 + MAUT2 + 
##     MAUT0 + MINK3045 + MINK7512 + MINK123M + MKOOPKLA + PPERSAUT + 
##     PMOTSCO + PVRAAUT + PAANHANG + PWERKT + PWAOREG + PPLEZIER + 
##     AWAPART + AWALAND + ABROM + ALEVEN + APERSONG + AGEZONG + 
##     ABRAND + APLEZIER + AFIETS + ABYSTAND
##   Resid. Df Resid. Dev  Df Deviance  Pr(>Chi)    
## 1     10931      14363                           
## 2     10662       9723 269   4640.4 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Model 3 and 4 comparison

# Likelihood-ratio test: does glm_6 (model 4) improve significantly on model 3?
# NOTE(review): model.3 includes MKOOPKLA, which glm_6 does not, so the two
# models are not strictly nested; the chi-squared deviance test assumes
# nesting — verify before relying on this p-value.
anova(model.3,glm_6,test = 'Chisq')
## Analysis of Deviance Table
## 
## Model 1: CARAVAN ~ PBRAND + MOSTYPE + PPERSAUT + MKOOPKLA + MHKOOP
## Model 2: CARAVAN ~ PPERSAUT + MBERHOOG + MGODPR + MHKOOP + MINKGEM + MINKM30 + 
##     MOSTYPE + PBRAND + PBROM
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1     10879      11912                          
## 2     10845      11274 34   637.93 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1